diff --git a/client/maxadmin.c b/client/maxadmin.c index b1d29ce3a..c406d704e 100644 --- a/client/maxadmin.c +++ b/client/maxadmin.c @@ -183,12 +183,22 @@ char c; len += strlen(argv[i]) + 1; } - cmd = malloc(len); - int arglen; - memcpy(cmd, argv[optind], (arglen = strlen(argv[optind])) < 8192 ? arglen : 8192); - for (i = optind +1; i < argc; i++) { + cmd = malloc(len + (2 * argc)); // Allow for quotes + strcpy(cmd, argv[optind]); + for (i = optind +1; i < argc; i++) + { strcat(cmd, " "); - strcat(cmd, argv[i]); + /* Arguments after the seconf are quoted to allow for names + * that contain white space + */ + if (i - optind > 1) + { + strcat(cmd, "\""); + strcat(cmd, argv[i]); + strcat(cmd, "\""); + } + else + strcat(cmd, argv[i]); } if (access(cmd, R_OK) == 0) diff --git a/rabbitmq_consumer/inih/._LICENSE.txt b/rabbitmq_consumer/inih/._LICENSE.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/._LICENSE.txt and /dev/null differ diff --git a/rabbitmq_consumer/inih/._README.txt b/rabbitmq_consumer/inih/._README.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/._README.txt and /dev/null differ diff --git a/rabbitmq_consumer/inih/._cpp b/rabbitmq_consumer/inih/._cpp deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/._cpp and /dev/null differ diff --git a/rabbitmq_consumer/inih/._examples b/rabbitmq_consumer/inih/._examples deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/._examples and /dev/null differ diff --git a/rabbitmq_consumer/inih/._extra b/rabbitmq_consumer/inih/._extra deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/._extra and /dev/null differ diff --git a/rabbitmq_consumer/inih/._ini.c b/rabbitmq_consumer/inih/._ini.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/._ini.c and /dev/null differ diff --git a/rabbitmq_consumer/inih/._ini.h b/rabbitmq_consumer/inih/._ini.h deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/._ini.h and /dev/null differ diff --git a/rabbitmq_consumer/inih/._tests b/rabbitmq_consumer/inih/._tests deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/._tests and /dev/null differ diff --git a/rabbitmq_consumer/inih/cpp/._INIReader.cpp b/rabbitmq_consumer/inih/cpp/._INIReader.cpp deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/cpp/._INIReader.cpp and /dev/null differ diff --git a/rabbitmq_consumer/inih/cpp/._INIReader.h b/rabbitmq_consumer/inih/cpp/._INIReader.h deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/cpp/._INIReader.h and /dev/null differ diff --git a/rabbitmq_consumer/inih/cpp/._INIReaderTest.cpp b/rabbitmq_consumer/inih/cpp/._INIReaderTest.cpp deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/cpp/._INIReaderTest.cpp and /dev/null differ diff --git a/rabbitmq_consumer/inih/examples/._config.def b/rabbitmq_consumer/inih/examples/._config.def deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/examples/._config.def and /dev/null differ diff --git a/rabbitmq_consumer/inih/examples/._ini_dump.c b/rabbitmq_consumer/inih/examples/._ini_dump.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/examples/._ini_dump.c and /dev/null differ diff --git a/rabbitmq_consumer/inih/examples/._ini_example.c b/rabbitmq_consumer/inih/examples/._ini_example.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/examples/._ini_example.c and /dev/null differ diff --git a/rabbitmq_consumer/inih/examples/._ini_xmacros.c b/rabbitmq_consumer/inih/examples/._ini_xmacros.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/examples/._ini_xmacros.c and /dev/null differ diff --git a/rabbitmq_consumer/inih/examples/._test.ini b/rabbitmq_consumer/inih/examples/._test.ini deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/examples/._test.ini and /dev/null differ diff --git a/rabbitmq_consumer/inih/extra/._Makefile.static b/rabbitmq_consumer/inih/extra/._Makefile.static deleted file mode 100755 index 17b8574a4..000000000 Binary files a/rabbitmq_consumer/inih/extra/._Makefile.static and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._bad_comment.ini b/rabbitmq_consumer/inih/tests/._bad_comment.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._bad_comment.ini and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._bad_multi.ini b/rabbitmq_consumer/inih/tests/._bad_multi.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._bad_multi.ini and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._bad_section.ini b/rabbitmq_consumer/inih/tests/._bad_section.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._bad_section.ini and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._baseline_multi.txt b/rabbitmq_consumer/inih/tests/._baseline_multi.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._baseline_multi.txt and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._baseline_single.txt b/rabbitmq_consumer/inih/tests/._baseline_single.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._baseline_single.txt and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._bom.ini b/rabbitmq_consumer/inih/tests/._bom.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._bom.ini and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._multi_line.ini b/rabbitmq_consumer/inih/tests/._multi_line.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._multi_line.ini and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._normal.ini b/rabbitmq_consumer/inih/tests/._normal.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._normal.ini and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._unittest.bat b/rabbitmq_consumer/inih/tests/._unittest.bat deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._unittest.bat and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._unittest.c b/rabbitmq_consumer/inih/tests/._unittest.c deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._unittest.c and /dev/null differ diff --git a/rabbitmq_consumer/inih/tests/._user_error.ini b/rabbitmq_consumer/inih/tests/._user_error.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/rabbitmq_consumer/inih/tests/._user_error.ini and /dev/null differ diff --git a/server/core/CMakeLists.txt b/server/core/CMakeLists.txt index bddf95239..b60999901 100644 --- a/server/core/CMakeLists.txt +++ b/server/core/CMakeLists.txt @@ -4,9 +4,10 @@ if(BUILD_TESTS) target_link_libraries(fullcore log_manager utils pthread ${EMBEDDED_LIB} ssl aio rt crypt dl crypto inih z m stdc++) endif() add_executable(maxscale atomic.c buffer.c spinlock.c gateway.c - gw_utils.c utils.c dcb.c load_utils.c session.c service.c server.c - poll.c config.c users.c hashtable.c dbusers.c thread.c gwbitmask.c - monitor.c adminusers.c secrets.c filter.c modutil.c hint.c housekeeper.c) + gw_utils.c utils.c dcb.c load_utils.c session.c service.c server.c + poll.c config.c users.c hashtable.c dbusers.c thread.c gwbitmask.c + monitor.c adminusers.c secrets.c filter.c modutil.c hint.c + housekeeper.c memlog.c) target_link_libraries(maxscale ${EMBEDDED_LIB} log_manager utils ssl aio pthread crypt dl crypto inih z rt m stdc++) install(TARGETS maxscale DESTINATION bin) @@ -20,4 +21,4 @@ install(TARGETS maxpasswd DESTINATION bin) if(BUILD_TESTS) add_subdirectory(test) -endif() \ No newline at end of file +endif() diff --git a/server/core/Makefile b/server/core/Makefile index 26c6482eb..4eab4162c 100644 --- a/server/core/Makefile +++ b/server/core/Makefile @@ -65,7 +65,8 @@ include ../../makefile.inc SRCS= atomic.c buffer.c spinlock.c gateway.c \ gw_utils.c utils.c dcb.c load_utils.c session.c service.c server.c \ poll.c config.c users.c hashtable.c dbusers.c thread.c gwbitmask.c \ - monitor.c adminusers.c secrets.c filter.c modutil.c hint.c housekeeper.c + monitor.c adminusers.c secrets.c filter.c modutil.c hint.c \ + housekeeper.c memlog.c HDRS= ../include/atomic.h ../include/buffer.h ../include/dcb.h \ ../include/gw.h ../modules/include/mysql_client_server_protocol.h \ @@ -73,7 +74,8 @@ HDRS= ../include/atomic.h ../include/buffer.h ../include/dcb.h \ ../include/modules.h ../include/poll.h ../include/config.h \ ../include/users.h ../include/hashtable.h ../include/gwbitmask.h \ ../include/adminusers.h ../include/version.h ../include/maxscale.h \ - ../include/filter.h ../include/modutil.h ../hint.h ../include/housekeeper.h + ../include/filter.h ../include/modutil.h ../hint.h \ + ../include/housekeeper.h ../include/memlog.h OBJ=$(SRCS:.c=.o) diff --git a/server/core/config.c b/server/core/config.c index cc5886c03..737781d7f 100644 --- a/server/core/config.c +++ b/server/core/config.c @@ -1150,6 +1150,31 @@ config_threadcount() return gateway.n_threads; } +/** + * Return the number of non-blocking polls to be done before a blocking poll + * is issued. + * + * @return The number of blocking poll calls to make before a blocking call + */ +unsigned int +config_nbpolls() +{ + return gateway.n_nbpoll; +} + +/** + * Return the configured number of milliseconds for which we wait when we do + * a blocking poll call. + * + * @return The number of milliseconds to sleep in a blocking poll call + */ +unsigned int +config_pollsleep() +{ + return gateway.pollsleep; +} + + static struct { char *logname; logfile_id_t logfile; @@ -1170,9 +1195,20 @@ static int handle_global_item(const char *name, const char *value) { int i; - if (strcmp(name, "threads") == 0) { + if (strcmp(name, "threads") == 0) + { gateway.n_threads = atoi(value); - } else { + } + else if (strcmp(name, "non_blocking_polls") == 0) + { + gateway.n_nbpoll = atoi(value); + } + else if (strcmp(name, "poll_sleep") == 0) + { + gateway.pollsleep = atoi(value); + } + else + { for (i = 0; lognames[i].logname; i++) { if (strcasecmp(name, lognames[i].logname) == 0) @@ -1194,6 +1230,8 @@ static void global_defaults() { gateway.n_threads = 1; + gateway.n_nbpoll = DEFAULT_NBPOLLS; + gateway.pollsleep = DEFAULT_POLLSLEEP; if (version_string != NULL) gateway.version_string = strdup(version_string); else diff --git a/server/core/dcb.c b/server/core/dcb.c index a4037eaa5..6e667b00a 100644 --- a/server/core/dcb.c +++ b/server/core/dcb.c @@ -206,6 +206,7 @@ DCB *rval; rval->low_water = 0; rval->next = NULL; rval->callbacks = NULL; + rval->data = NULL; rval->remote = NULL; rval->user = NULL; @@ -240,7 +241,7 @@ dcb_free(DCB *dcb) { LOGIF(LE, (skygw_log_write_flush( LOGFILE_ERROR, - "Error : Attempt to free a DCB via dcb_fee " + "Error : Attempt to free a DCB via dcb_free " "that has been associated with a descriptor."))); } } @@ -342,6 +343,15 @@ DCB_CALLBACK *cb; dcb->state == DCB_STATE_ALLOC, "dcb not in DCB_STATE_DISCONNECTED not in DCB_STATE_ALLOC state."); + if (DCB_POLL_BUSY(dcb)) + { + /* Check if DCB has outstanding poll events */ + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "dcb_final_free: DCB %p has outstanding events", + dcb))); + } + /*< First remove this DCB from the chain */ spinlock_acquire(&dcbspin); if (allDCBs == dcb) @@ -413,6 +423,7 @@ DCB_CALLBACK *cb; } spinlock_release(&dcb->cb_lock); + bitmask_free(&dcb->memdata.bitmask); free(dcb); } @@ -1208,7 +1219,8 @@ dcb_close(DCB *dcb) */ if (dcb->state == DCB_STATE_POLLING) { - rc = poll_remove_dcb(dcb); + if (dcb->fd != -1) + rc = poll_remove_dcb(dcb); if (rc == 0) { LOGIF(LD, (skygw_log_write( @@ -1276,9 +1288,9 @@ printDCB(DCB *dcb) dcb->stats.n_buffered); printf("\t\tNo. of Accepts: %d\n", dcb->stats.n_accepts); - printf("\t\tNo. of High Water Events: %d\n", + printf("\t\tNo. of High Water Events: %d\n", dcb->stats.n_high_water); - printf("\t\tNo. of Low Water Events: %d\n", + printf("\t\tNo. of Low Water Events: %d\n", dcb->stats.n_low_water); } /** @@ -1463,6 +1475,12 @@ dprintDCB(DCB *pdcb, DCB *dcb) dcb->stats.n_high_water); dcb_printf(pdcb, "\t\tNo. of Low Water Events: %d\n", dcb->stats.n_low_water); + if (DCB_POLL_BUSY(dcb)) + { + dcb_printf(pdcb, "\t\tPending events in the queue: %x %s\n", + dcb->evq.pending_events, dcb->evq.processing ? "(processing)" : ""); + + } if (dcb->flags & DCBF_CLONE) dcb_printf(pdcb, "\t\tDCB is a clone.\n"); #if SPINLOCK_PROFILE diff --git a/server/core/gateway.c b/server/core/gateway.c index fce45e425..8ea29813f 100644 --- a/server/core/gateway.c +++ b/server/core/gateway.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -1548,13 +1549,10 @@ int main(int argc, char **argv) * instances of the gateway are beign run on the same * machine. */ - sprintf(datadir, "%s/data%d", home_dir, getpid()); - if(mkdir(datadir, 0777) != 0){ - LOGIF(LE,(skygw_log_write_flush( - LOGFILE_ERROR, - "Error : Directory creation failed due to %s.", - strerror(errno)))); - } + sprintf(datadir, "%s/data", home_dir); + mkdir(datadir, 0777); + sprintf(datadir, "%s/data/data%d", home_dir, getpid()); + mkdir(datadir, 0777); if (!daemon_mode) { @@ -1758,6 +1756,7 @@ int main(int argc, char **argv) LOGFILE_MESSAGE, "MaxScale shutdown completed."))); + unload_all_modules(); /* Remove Pidfile */ unlink_pidfile(); @@ -1776,10 +1775,11 @@ return_main: * Shutdown MaxScale server */ void - shutdown_server() +shutdown_server() { poll_shutdown(); hkshutdown(); + memlog_flush_all(); log_flush_shutdown(); } diff --git a/server/core/housekeeper.c b/server/core/housekeeper.c index 50556b0c6..2225e628f 100644 --- a/server/core/housekeeper.c +++ b/server/core/housekeeper.c @@ -24,11 +24,21 @@ /** * @file housekeeper.c Provide a mechanism to run periodic tasks * + * The housekeeper provides a mechanism to allow for tasks, function + * calls basically, to be run on a tiem basis. A task may be run + * repeatedly, with a given frequency (in seconds), or may be a one + * shot task that will only be run once after a specified number of + * seconds. + * + * The housekeeper also maintains a global variable, hkheartbeat, that + * is incremented every 100ms. + * * @verbatim * Revision History * * Date Who Description * 29/08/14 Mark Riddoch Initial implementation + * 22/10/14 Mark Riddoch Addition of one-shot tasks * * @endverbatim */ @@ -43,6 +53,7 @@ static HKTASK *tasks = NULL; static SPINLOCK tasklock = SPINLOCK_INIT; static int do_shutdown = 0; +unsigned long hkheartbeat = 0; static void hkthread(void *); @@ -69,7 +80,7 @@ hkinit() * @param taskfn The function to call for the task * @param data Data to pass to the task function * @param frequency How often to run the task, expressed in seconds - * @return Return the tiem in seconds when the task will be first run if the task was added, otherwise 0 + * @return Return the time in seconds when the task will be first run if the task was added, otherwise 0 */ int hktask_add(char *name, void (*taskfn)(void *), void *data, int frequency) @@ -88,6 +99,7 @@ HKTASK *task, *ptr; task->task = taskfn; task->data = data; task->frequency = frequency; + task->type = HK_REPEATED; task->nextdue = time(0) + frequency; task->next = NULL; spinlock_acquire(&tasklock); @@ -112,6 +124,61 @@ HKTASK *task, *ptr; return task->nextdue; } +/** + * Add a one-shot task to the housekeeper task list + * + * Task names must be unique. + * + * @param name The unique name for this housekeeper task + * @param taskfn The function to call for the task + * @param data Data to pass to the task function + * @param when How many second until the task is executed + * @return Return the time in seconds when the task will be first run if the task was added, otherwise 0 + * + */ +int +hktask_oneshot(char *name, void (*taskfn)(void *), void *data, int when) +{ +HKTASK *task, *ptr; + + if ((task = (HKTASK *)malloc(sizeof(HKTASK))) == NULL) + { + return 0; + } + if ((task->name = strdup(name)) == NULL) + { + free(task); + return 0; + } + task->task = taskfn; + task->data = data; + task->frequency = 0; + task->type = HK_ONESHOT; + task->nextdue = time(0) + when; + task->next = NULL; + spinlock_acquire(&tasklock); + ptr = tasks; + while (ptr && ptr->next) + { + if (strcmp(ptr->name, name) == 0) + { + spinlock_release(&tasklock); + free(task->name); + free(task); + return 0; + } + ptr = ptr->next; + } + if (ptr) + ptr->next = task; + else + tasks = task; + spinlock_release(&tasklock); + + return task->nextdue; +} + + /** * Remove a named task from the housekeepers task list * @@ -171,12 +238,17 @@ HKTASK *ptr; time_t now; void (*taskfn)(void *); void *taskdata; +int i; for (;;) { - if (do_shutdown) - return; - thread_millisleep(1000); + for (i = 0; i < 10; i++) + { + if (do_shutdown) + return; + thread_millisleep(100); + hkheartbeat++; + } now = time(0); spinlock_acquire(&tasklock); ptr = tasks; @@ -189,6 +261,8 @@ void *taskdata; taskdata = ptr->data; spinlock_release(&tasklock); (*taskfn)(taskdata); + if (ptr->type == HK_ONESHOT) + hktask_remove(ptr->name); spinlock_acquire(&tasklock); ptr = tasks; } @@ -208,3 +282,33 @@ hkshutdown() { do_shutdown = 1; } + +/** + * Show the tasks that are scheduled for the house keeper + * + * @param pdcb The DCB to send to output + */ +void +hkshow_tasks(DCB *pdcb) +{ +HKTASK *ptr; +struct tm tm; +char buf[40]; + + dcb_printf(pdcb, "%-25s | Type | Frequency | Next Due\n", "Name"); + dcb_printf(pdcb, "--------------------------+----------+-----------+-------------------------\n"); + spinlock_acquire(&tasklock); + ptr = tasks; + while (ptr) + { + localtime_r(&ptr->nextdue, &tm); + asctime_r(&tm, buf); + dcb_printf(pdcb, "%-25s | %-8s | %-9d | %s", + ptr->name, + ptr->type == HK_REPEATED ? "Repeated" : "One-Shot", + ptr->frequency, + buf); + ptr = ptr->next; + } + spinlock_release(&tasklock); +} diff --git a/server/core/load_utils.c b/server/core/load_utils.c index 2d1f5ecd8..3fe975f5c 100644 --- a/server/core/load_utils.c +++ b/server/core/load_utils.c @@ -330,12 +330,28 @@ MODULES *ptr; * The module is now not in the linked list and all * memory related to it can be freed */ + dlclose(mod->handle); free(mod->module); free(mod->type); free(mod->version); free(mod); } +/** + * Unload all modules + * + * Remove all the modules from the system, called during shutdown + * to allow termination hooks to be called. + */ +void +unload_all_modules() +{ + while (registered) + { + unregister_module(registered->module); + } +} + /** * Print Modules * diff --git a/server/core/memlog.c b/server/core/memlog.c new file mode 100644 index 000000000..43df8c5da --- /dev/null +++ b/server/core/memlog.c @@ -0,0 +1,254 @@ +/* + * This file is distributed as part of the MariaDB MaxScale. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright MariaDB Ab 2014 + */ + +/** + * @file memlog.c - Implementation of memory logging mechanism for debug purposes + * + * @verbatim + * Revision History + * + * Date Who Description + * 26/09/14 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include +#include + +static MEMLOG *memlogs = NULL; +static SPINLOCK *memlock = SPINLOCK_INIT; + +/** + * Create a new instance of a memory logger. + * + * @param name The name of the memory log + * @param type The type of item being logged + * @param size The number of items to store in memory before flushign to disk + * + * @return MEMLOG* A memory log handle + */ +MEMLOG * +memlog_create(char *name, MEMLOGTYPE type, int size) +{ +MEMLOG *log; + + if ((log = (MEMLOG *)malloc(sizeof(MEMLOG))) == NULL) + { + return NULL; + } + + log->name = strdup(name); + spinlock_init(&log->lock); + log->type = type; + log->offset = 0; + log->size = size; + log->flags = 0; + switch (type) + { + case ML_INT: + log->values = malloc(sizeof(int) * size); + break; + case ML_LONG: + log->values = malloc(sizeof(long) * size); + break; + case ML_LONGLONG: + log->values = malloc(sizeof(long long) * size); + break; + case ML_STRING: + log->values = malloc(sizeof(char *) * size); + break; + } + if (log->values == NULL) + { + free(log); + return NULL; + } + spinlock_acquire(&memlock); + log->next = memlogs; + memlogs = log; + spinlock_release(&memlock); + + return log; +} + +/** + * Destroy a memory logger any unwritten data will be flushed to disk + * + * @param log The memory log to destroy + */ +void +memlog_destroy(MEMLOG *log) +{ +MEMLOG *ptr; + + if ((log->flags & MLNOAUTOFLUSH) == 0) + memlog_flush(log); + free(log->values); + + spinlock_acquire(&memlock); + if (memlogs == log) + memlogs = log->next; + else + { + ptr = memlogs; + while (ptr && ptr->next != log) + ptr = ptr->next; + if (ptr) + ptr->next = log->next; + } + spinlock_release(&memlock); + free(log->name); + free(log); +} + +/** + * Log a data item to the memory logger + * + * @param log The memory logger + * @param value The value to log + */ +void +memlog_log(MEMLOG *log, void *value) +{ + if (!log) + return; + spinlock_acquire(&log->lock); + switch (log->type) + { + case ML_INT: + ((int *)(log->values))[log->offset] = (int)value; + break; + case ML_LONG: + ((long *)(log->values))[log->offset] = (long)value; + break; + case ML_LONGLONG: + ((long long *)(log->values))[log->offset] = (long long)value; + break; + case ML_STRING: + ((char **)(log->values))[log->offset] = (char *)value; + break; + } + log->offset++; + if (log->offset == log->size) + { + if ((log->flags & MLNOAUTOFLUSH) == 0) + memlog_flush(log); + log->offset = 0; + log->iflags = MLWRAPPED; + } + spinlock_release(&log->lock); +} + +/** + * Flush all memlogs to disk, called during shutdown + * + */ +void +memlog_flush_all() +{ +MEMLOG *log; + + spinlock_acquire(&memlock); + log = memlogs; + while (log) + { + spinlock_acquire(&log->lock); + memlog_flush(log); + spinlock_release(&log->lock); + log = log->next; + } + spinlock_release(&memlock); +} + +/** + * Set the flags for a memlog + * + * @param log The memlog to set the flags for + * @param flags The new flags values + */ +void +memlog_set(MEMLOG *log, unsigned int flags) +{ + log->flags = flags; +} + +/** + * Flush a memory log to disk + * + * Assumes the the log->lock has been acquired by the caller + * + * @param log The memory log to flush + */ +void +memlog_flush(MEMLOG *log) +{ +FILE *fp; +int i; + + if ((fp = fopen(log->name, "a")) == NULL) + return; + if ((log->flags & MLNOAUTOFLUSH) && (log->iflags & MLWRAPPED)) + { + for (i = 0; i < log->size; i++) + { + int ind = (i + log->offset) % log->size; + switch (log->type) + { + case ML_INT: + fprintf(fp, "%d\n", + ((int *)(log->values))[ind]); + break; + case ML_LONG: + fprintf(fp, "%ld\n", + ((long *)(log->values))[ind]); + break; + case ML_LONGLONG: + fprintf(fp, "%lld\n", + ((long long *)(log->values))[ind]); + break; + case ML_STRING: + fprintf(fp, "%s\n", + ((char **)(log->values))[ind]); + break; + } + } + } + else + { + for (i = 0; i < log->offset; i++) + { + switch (log->type) + { + case ML_INT: + fprintf(fp, "%d\n", ((int *)(log->values))[i]); + break; + case ML_LONG: + fprintf(fp, "%ld\n", ((long *)(log->values))[i]); + break; + case ML_LONGLONG: + fprintf(fp, "%lld\n", ((long long *)(log->values))[i]); + break; + case ML_STRING: + fprintf(fp, "%s\n", ((char **)(log->values))[i]); + break; + } + } + } + log->offset = 0; + fclose(fp); +} diff --git a/server/core/poll.c b/server/core/poll.c index 7a5b31d85..ecf17643c 100644 --- a/server/core/poll.c +++ b/server/core/poll.c @@ -33,10 +33,14 @@ #include #include -#define PROFILE_POLL 1 +#define PROFILE_POLL 0 #if PROFILE_POLL #include +#include + +extern unsigned long hkheartbeat; +MEMLOG *plog; #endif /** Defined in log_manager.cc */ @@ -44,6 +48,9 @@ extern int lm_enabled_logfiles_bitmask; extern size_t log_ses_count[]; extern __thread log_info_t tls_log_info; +int number_poll_spins; +int max_poll_sleep; + /** * @file poll.c - Abstraction of the epoll functionality * @@ -70,7 +77,7 @@ extern __thread log_info_t tls_log_info; /** * Control the use of mutexes for the epoll_wait call. Setting to 1 will * cause the epoll_wait calls to be moved under a mutex. This may be useful - * for debuggign purposes but should be avoided in general use. + * for debugging purposes but should be avoided in general use. */ #define MUTEX_EPOLL 0 @@ -97,6 +104,7 @@ static int load_samples = 0; static int load_nfds = 0; static double current_avg = 0.0; static double *avg_samples = NULL; +static int *evqp_samples = NULL; static int next_sample = 0; static int n_avg_samples; @@ -144,13 +152,29 @@ static struct { int n_hup; /*< Number of hangup events */ int n_accept; /*< Number of accept events */ int n_polls; /*< Number of poll cycles */ + int n_pollev; /*< Number of polls returnign events */ + int n_nbpollev; /*< Number of polls returnign events */ int n_nothreads; /*< Number of times no threads are polling */ int n_fds[MAXNFDS]; /*< Number of wakeups with particular n_fds value */ int evq_length; /*< Event queue length */ + int evq_pending; /*< Number of pending descriptors in event queue */ int evq_max; /*< Maximum event queue length */ + int wake_evqpending;/*< Woken from epoll_wait with pending events in queue */ + int blockingpolls; /*< Number of epoll_waits with a timeout specified */ } pollStats; +#define N_QUEUE_TIMES 30 +/** + * The event queue statistics + */ +static struct { + unsigned int qtimes[N_QUEUE_TIMES+1]; + unsigned int exectimes[N_QUEUE_TIMES+1]; + unsigned long maxqtime; + unsigned long maxexectime; +} queueStats; + /** * How frequently to call the poll_loadav function used to monitor the load * average of the poll subsystem. @@ -179,6 +203,7 @@ int i; exit(-1); } memset(&pollStats, 0, sizeof(pollStats)); + memset(&queueStats, 0, sizeof(queueStats)); bitmask_init(&poll_mask); n_threads = config_threadcount(); if ((thread_data = @@ -195,10 +220,19 @@ int i; hktask_add("Load Average", poll_loadav, NULL, POLL_LOAD_FREQ); n_avg_samples = 15 * 60 / POLL_LOAD_FREQ; - avg_samples = (double *)malloc(sizeof(double *) * n_avg_samples); + avg_samples = (double *)malloc(sizeof(double) * n_avg_samples); for (i = 0; i < n_avg_samples; i++) avg_samples[i] = 0.0; + evqp_samples = (int *)malloc(sizeof(int) * n_avg_samples); + for (i = 0; i < n_avg_samples; i++) + evqp_samples[i] = 0.0; + number_poll_spins = config_nbpolls(); + max_poll_sleep = config_pollsleep(); + +#if PROFILE_POLL + plog = memlog_create("EventQueueWaitTime", ML_LONG, 10000); +#endif } /** @@ -362,7 +396,7 @@ return_rc: * deschedule a process if a timeout is included, but will not do this if a 0 timeout * value is given. this improves performance when the gateway is under heavy load. * - * In order to provide a fairer means of sharign the threads between the different + * In order to provide a fairer means of sharing the threads between the different * DCB's the poll mechanism has been decoupled from the processing of the events. * The events are now recieved via the epoll_wait call, a queue of DCB's that have * events pending is maintained and as new events arrive the DCB is added to the end @@ -373,15 +407,33 @@ return_rc: * events at a high rate will not block the execution of events for other DCB's and * should result in a fairer polling strategy. * + * The introduction of the ability to inject "fake" write events into the event queue meant + * that there was a possibility to "starve" new events sicne the polling loop would + * consume the event queue before looking for new events. If the DCB that inject + * the fake event then injected another fake event as a result of the first it meant + * that new events did not get added to the queue. The strategy has been updated to + * not consume the entire event queue, but process one event before doing a non-blocking + * call to add any new events before processing any more events. A blocking call to + * collect events is only made if there are no pending events to be processed on the + * event queue. + * + * Also introduced a "timeout bias" mechanism. This mechansim control the length of + * of timeout passed to epoll_wait in blocking calls based on previous behaviour. + * The initial call will block for 10% of the define timeout peroid, this will be + * increased in increments of 10% until the full timeout value is used. If at any + * point there is an event to be processed then the value will be reduced to 10% again + * for the next blocking call. + * * @param arg The thread ID passed as a void * to satisfy the threading package */ void poll_waitevents(void *arg) { struct epoll_event events[MAX_EVENTS]; -int i, nfds; +int i, nfds, timeout_bias = 1; int thread_id = (int)arg; DCB *zombies = NULL; +int poll_spins = 0; /** Add this thread to the bitmask of running polling threads */ bitmask_set(&poll_mask, thread_id); @@ -395,12 +447,9 @@ DCB *zombies = NULL; while (1) { - /* Process of the queue of waiting requests */ - while (do_shutdown == 0 && process_pollq(thread_id)) + if (pollStats.evq_pending == 0 && timeout_bias < 10) { - if (thread_data) - thread_data[thread_id].state = THREAD_ZPROCESSING; - zombies = dcb_process_zombies(thread_id); + timeout_bias++; } atomic_add(&n_waiting, 1); @@ -416,6 +465,7 @@ DCB *zombies = NULL; thread_data[thread_id].state = THREAD_POLLING; } + atomic_add(&pollStats.n_polls, 1); if ((nfds = epoll_wait(epoll_fd, events, MAX_EVENTS, 0)) == -1) { atomic_add(&n_waiting, -1); @@ -428,19 +478,28 @@ DCB *zombies = NULL; pthread_self(), nfds, eno))); + atomic_add(&n_waiting, -1); } /* * If there are no new descriptors from the non-blocking call - * and nothing to proces on the event queue then for do a + * and nothing to process on the event queue then for do a * blocking call to epoll_wait. + * + * We calculate a timeout bias to alter the length of the blocking + * call based on the time since we last received an event to process */ - else if (nfds == 0 && process_pollq(thread_id) == 0) + else if (nfds == 0 && pollStats.evq_pending == 0 && poll_spins++ > number_poll_spins) { - atomic_add(&n_waiting, 1); + atomic_add(&pollStats.blockingpolls, 1); nfds = epoll_wait(epoll_fd, events, MAX_EVENTS, - EPOLL_TIMEOUT); + (max_poll_sleep * timeout_bias) / 10); + if (nfds == 0 && pollStats.evq_pending) + { + atomic_add(&pollStats.wake_evqpending, 1); + poll_spins = 0; + } } else { @@ -455,12 +514,16 @@ DCB *zombies = NULL; #endif /* BLOCKINGPOLL */ if (nfds > 0) { + timeout_bias = 1; + if (poll_spins <= number_poll_spins + 1) + atomic_add(&pollStats.n_nbpollev, 1); + poll_spins = 0; LOGIF(LD, (skygw_log_write( LOGFILE_DEBUG, "%lu [poll_waitevents] epoll_wait found %d fds", pthread_self(), nfds))); - atomic_add(&pollStats.n_polls, 1); + atomic_add(&pollStats.n_pollev, 1); if (thread_data) { thread_data[thread_id].n_fds = nfds; @@ -479,7 +542,7 @@ DCB *zombies = NULL; /* * Process every DCB that has a new event and add * it to the poll queue. - * If the DCB is currently beign processed then we + * If the DCB is currently being processed then we * or in the new eent bits to the pending event bits * and leave it in the queue. * If the DCB was not already in the queue then it was @@ -494,6 +557,11 @@ DCB *zombies = NULL; spinlock_acquire(&pollqlock); if (DCB_POLL_BUSY(dcb)) { + if (dcb->evq.pending_events == 0) + { + pollStats.evq_pending++; + dcb->evq.inserted = hkheartbeat; + } dcb->evq.pending_events |= ev; } else @@ -513,6 +581,8 @@ DCB *zombies = NULL; dcb->evq.next = dcb; } pollStats.evq_length++; + pollStats.evq_pending++; + dcb->evq.inserted = hkheartbeat; if (pollStats.evq_length > pollStats.evq_max) { pollStats.evq_max = pollStats.evq_length; @@ -523,17 +593,20 @@ DCB *zombies = NULL; } /* - * If there was nothing to process then process the zombie queue + * Process of the queue of waiting requests + * This is done without checking the evq_pending count as a + * precautionary measure to avoid issues if the house keeping + * of the count goes wrong. */ - if (process_pollq(thread_id) == 0) - { - if (thread_data) - { - thread_data[thread_id].state = THREAD_ZPROCESSING; - } - zombies = dcb_process_zombies(thread_id); - } - + if (process_pollq(thread_id)) + timeout_bias = 1; + + if (thread_data) + thread_data[thread_id].state = THREAD_ZPROCESSING; + zombies = dcb_process_zombies(thread_id); + if (thread_data) + thread_data[thread_id].state = THREAD_IDLE; + if (do_shutdown) { /*< @@ -556,6 +629,34 @@ DCB *zombies = NULL; } /*< while(1) */ } +/** + * Set the number of non-blocking poll cycles that will be done before + * a blocking poll will take place. Whenever an event arrives on a thread + * or the thread sees a pending event to execute it will reset it's + * poll_spin coutn to zero and will then poll with a 0 timeout until the + * poll_spin value is greater than the value set here. + * + * @param nbpolls Number of non-block polls to perform before blocking + */ +void +poll_set_nonblocking_polls(unsigned int nbpolls) +{ + number_poll_spins = nbpolls; +} + +/** + * Set the maximum amount of time, in milliseconds, the polling thread + * will block before it will wake and check the event queue for work + * that may have been added by another thread. + * + * @param maxwait Maximum wait time in milliseconds + */ +void +poll_set_maxwait(unsigned int maxwait) +{ + max_poll_sleep = maxwait; +} + /** * Process of the queue of DCB's that have outstanding events * @@ -576,6 +677,7 @@ process_pollq(int thread_id) DCB *dcb; int found = 0; uint32_t ev; +unsigned long qtime; spinlock_acquire(&pollqlock); if (eventq == NULL) @@ -612,13 +714,28 @@ uint32_t ev; if (found) { ev = dcb->evq.pending_events; + dcb->evq.processing_events = ev; dcb->evq.pending_events = 0; + pollStats.evq_pending--; } spinlock_release(&pollqlock); if (found == 0) return 0; +#if PROFILE_POLL + memlog_log(plog, hkheartbeat - dcb->evq.inserted); +#endif + qtime = hkheartbeat - dcb->evq.inserted; + dcb->evq.started = hkheartbeat; + + if (qtime > N_QUEUE_TIMES) + queueStats.qtimes[N_QUEUE_TIMES]++; + else + queueStats.qtimes[qtime]++; + if (qtime > queueStats.maxqtime) + queueStats.maxqtime = qtime; + CHK_DCB(dcb); if (thread_data) @@ -836,7 +953,18 @@ uint32_t ev; spinlock_release(&dcb->dcb_initlock); } #endif + qtime = hkheartbeat - dcb->evq.started; + + if (qtime > N_QUEUE_TIMES) + queueStats.exectimes[N_QUEUE_TIMES]++; + else + queueStats.exectimes[qtime % N_QUEUE_TIMES]++; + if (qtime > queueStats.maxexectime) + queueStats.maxexectime = qtime; + spinlock_acquire(&pollqlock); + dcb->evq.processing_events = 0; + if (dcb->evq.pending_events == 0) { /* No pending events so remove from the queue */ @@ -930,24 +1058,35 @@ dprintPollStats(DCB *dcb) { int i; - dcb_printf(dcb, "Number of epoll cycles: %d\n", + dcb_printf(dcb, "\nPoll Statistics.\n\n"); + dcb_printf(dcb, "No. of epoll cycles: %d\n", pollStats.n_polls); - dcb_printf(dcb, "Number of read events: %d\n", + dcb_printf(dcb, "No. of epoll cycles with wait: %d\n", + pollStats.blockingpolls); + dcb_printf(dcb, "No. of epoll calls returning events: %d\n", + pollStats.n_pollev); + dcb_printf(dcb, "No. of non-blocking calls returning events: %d\n", + pollStats.n_nbpollev); + dcb_printf(dcb, "No. of read events: %d\n", pollStats.n_read); - dcb_printf(dcb, "Number of write events: %d\n", + dcb_printf(dcb, "No. of write events: %d\n", pollStats.n_write); - dcb_printf(dcb, "Number of error events: %d\n", + dcb_printf(dcb, "No. of error events: %d\n", pollStats.n_error); - dcb_printf(dcb, "Number of hangup events: %d\n", + dcb_printf(dcb, "No. of hangup events: %d\n", pollStats.n_hup); - dcb_printf(dcb, "Number of accept events: %d\n", + dcb_printf(dcb, "No. of accept events: %d\n", pollStats.n_accept); - dcb_printf(dcb, "Number of times no threads polling: %d\n", + dcb_printf(dcb, "No. of times no threads polling: %d\n", pollStats.n_nothreads); - dcb_printf(dcb, "Current event queue length: %d\n", + dcb_printf(dcb, "Current event queue length: %d\n", pollStats.evq_length); - dcb_printf(dcb, "Maximum event queue length: %d\n", + dcb_printf(dcb, "Maximum event queue length: %d\n", pollStats.evq_max); + dcb_printf(dcb, "No. of DCBs with pending events: %d\n", + pollStats.evq_pending); + dcb_printf(dcb, "No. of wakeups with pending queue: %d\n", + pollStats.wake_evqpending); dcb_printf(dcb, "No of poll completions with descriptors\n"); dcb_printf(dcb, "\tNo. of descriptors\tNo. of poll completions.\n"); @@ -1024,6 +1163,7 @@ dShowThreads(DCB *dcb) int i, j, n; char *state; double avg1 = 0.0, avg5 = 0.0, avg15 = 0.0; +double qavg1 = 0.0, qavg5 = 0.0, qavg15 = 0.0; dcb_printf(dcb, "Polling Threads.\n\n"); @@ -1032,8 +1172,12 @@ double avg1 = 0.0, avg5 = 0.0, avg15 = 0.0; /* Average all the samples to get the 15 minute average */ for (i = 0; i < n_avg_samples; i++) + { avg15 += avg_samples[i]; + qavg15 += evqp_samples[i]; + } avg15 = avg15 / n_avg_samples; + qavg15 = qavg15 / n_avg_samples; /* Average the last third of the samples to get the 5 minute average */ n = 5 * 60 / POLL_LOAD_FREQ; @@ -1041,8 +1185,12 @@ double avg1 = 0.0, avg5 = 0.0, avg15 = 0.0; if (i < 0) i += n_avg_samples; for (j = i; j < i + n; j++) + { avg5 += avg_samples[j % n_avg_samples]; + qavg5 += evqp_samples[j % n_avg_samples]; + } avg5 = (3 * avg5) / (n_avg_samples); + qavg5 = (3 * qavg5) / (n_avg_samples); /* Average the last 15th of the samples to get the 1 minute average */ n = 60 / POLL_LOAD_FREQ; @@ -1050,16 +1198,23 @@ double avg1 = 0.0, avg5 = 0.0, avg15 = 0.0; if (i < 0) i += n_avg_samples; for (j = i; j < i + n; j++) + { avg1 += avg_samples[j % n_avg_samples]; + qavg1 += evqp_samples[j % n_avg_samples]; + } avg1 = (15 * avg1) / (n_avg_samples); + qavg1 = (15 * qavg1) / (n_avg_samples); dcb_printf(dcb, "15 Minute Average: %.2f, 5 Minute Average: %.2f, " "1 Minute Average: %.2f\n\n", avg15, avg5, avg1); + dcb_printf(dcb, "Pending event queue length averages:\n"); + dcb_printf(dcb, "15 Minute Average: %.2f, 5 Minute Average: %.2f, " + "1 Minute Average: %.2f\n\n", qavg15, qavg5, qavg1); if (thread_data == NULL) return; - dcb_printf(dcb, " ID | State | # fds | Descriptor | Event\n"); - dcb_printf(dcb, "----+------------+--------+------------------+---------------\n"); + dcb_printf(dcb, " ID | State | # fds | Descriptor | Running | Event\n"); + dcb_printf(dcb, "----+------------+--------+------------------+----------+---------------\n"); for (i = 0; i < n_threads; i++) { switch (thread_data[i].state) @@ -1082,11 +1237,11 @@ double avg1 = 0.0, avg5 = 0.0, avg15 = 0.0; } if (thread_data[i].state != THREAD_PROCESSING) dcb_printf(dcb, - " %2d | %-10s | | |\n", + " %2d | %-10s | | | |\n", i, state); else if (thread_data[i].cur_dcb == NULL) dcb_printf(dcb, - " %2d | %-10s | %6d | |\n", + " %2d | %-10s | %6d | | |\n", i, state, thread_data[i].n_fds); else { @@ -1104,9 +1259,10 @@ double avg1 = 0.0, avg5 = 0.0, avg15 = 0.0; from_heap = true; } dcb_printf(dcb, - " %2d | %-10s | %6d | %-16p | %s\n", + " %2d | %-10s | %6d | %-16p | <%3d00ms | %s\n", i, state, thread_data[i].n_fds, - thread_data[i].cur_dcb, event_string); + thread_data[i].cur_dcb, 1 + hkheartbeat - dcb->evq.started, + event_string); if (from_heap) { @@ -1139,6 +1295,7 @@ int new_samples, new_nfds; else current_avg = 0.0; avg_samples[next_sample] = current_avg; + evqp_samples[next_sample] = pollStats.evq_pending; next_sample++; if (next_sample >= n_avg_samples) next_sample = 0; @@ -1207,4 +1364,136 @@ static void poll_add_event_to_dcb( } } spinlock_release(&pollqlock); -} \ No newline at end of file +} + +/* + * Insert a fake write completion event for a DCB into the polling + * queue. + * + * This is used to trigger transmission activity on another DCB from + * within the event processing routine of a DCB. or to allow a DCB + * to defer some further output processing, to allow for other DCBs + * to receive a slice of the processing time. Fake events are added + * to the tail of the event queue, in the same way that real events + * are, so maintain the "fairness" of processing. + * + * @param dcb DCB to emulate an EPOLLOUT event for + */ +void +poll_fake_write_event(DCB *dcb) +{ +uint32_t ev = EPOLLOUT; + + spinlock_acquire(&pollqlock); + /* + * If the DCB is already on the queue, there are no pending events and + * there are other events on the queue, then + * take it off the queue. This stops the DCB hogging the threads. + */ + if (DCB_POLL_BUSY(dcb) && dcb->evq.pending_events == 0 && dcb->evq.prev != dcb) + { + dcb->evq.prev->evq.next = dcb->evq.next; + dcb->evq.next->evq.prev = dcb->evq.prev; + if (eventq == dcb) + eventq = dcb->evq.next; + dcb->evq.next = NULL; + dcb->evq.prev = NULL; + pollStats.evq_length--; + } + + if (DCB_POLL_BUSY(dcb)) + { + if (dcb->evq.pending_events == 0) + pollStats.evq_pending++; + dcb->evq.pending_events |= ev; + } + else + { + dcb->evq.pending_events = ev; + dcb->evq.inserted = hkheartbeat; + if (eventq) + { + dcb->evq.prev = eventq->evq.prev; + eventq->evq.prev->evq.next = dcb; + eventq->evq.prev = dcb; + dcb->evq.next = eventq; + } + else + { + eventq = dcb; + dcb->evq.prev = dcb; + dcb->evq.next = dcb; + } + pollStats.evq_length++; + pollStats.evq_pending++; + dcb->evq.inserted = hkheartbeat; + if (pollStats.evq_length > pollStats.evq_max) + { + pollStats.evq_max = pollStats.evq_length; + } + } + spinlock_release(&pollqlock); +} + +/** + * Print the event queue contents + * + * @param pdcb The DCB to print the event queue to + */ +void +dShowEventQ(DCB *pdcb) +{ +DCB *dcb; + + spinlock_acquire(&pollqlock); + if (eventq == NULL) + { + /* Nothing to process */ + spinlock_release(&pollqlock); + return; + } + dcb = eventq; + dcb_printf(pdcb, "\nEvent Queue.\n"); + dcb_printf(pdcb, "%-16s | %-10s | %-18s | %s\n", "DCB", "Status", "Processing Events", + "Pending Events"); + dcb_printf(pdcb, "-----------------+------------+--------------------+-------------------\n"); + do { + dcb_printf(pdcb, "%-16p | %-10s | %-18s | %-18s\n", dcb, + dcb->evq.processing ? "Processing" : "Pending", + event_to_string(dcb->evq.processing_events), + event_to_string(dcb->evq.pending_events)); + dcb = dcb->evq.next; + } while (dcb != eventq); + spinlock_release(&pollqlock); +} + + +/** + * Print the event queue statistics + * + * @param pdcb The DCB to print the event queue to + */ +void +dShowEventStats(DCB *pdcb) +{ +int i; + + dcb_printf(pdcb, "\nEvent statistics.\n"); + dcb_printf(pdcb, "Maximum queue time: %3d00ms\n", queueStats.maxqtime); + dcb_printf(pdcb, "Maximum execution time: %3d00ms\n", queueStats.maxexectime); + dcb_printf(pdcb, "Maximum event queue length: %3d\n", pollStats.evq_max); + dcb_printf(pdcb, "Current event queue length: %3d\n", pollStats.evq_length); + dcb_printf(pdcb, "\n"); + dcb_printf(pdcb, " | Number of events\n"); + dcb_printf(pdcb, "Duration | Queued | Executed\n"); + dcb_printf(pdcb, "---------------+------------+-----------\n"); + dcb_printf(pdcb, " < 100ms | %-10d | %-10d\n", + queueStats.qtimes[0], queueStats.exectimes[0]); + for (i = 1; i < N_QUEUE_TIMES; i++) + { + dcb_printf(pdcb, " %2d00 - %2d00ms | %-10d | %-10d\n", i, i + 1, + queueStats.qtimes[i], queueStats.exectimes[i]); + } + dcb_printf(pdcb, " > %2d00ms | %-10d | %-10d\n", N_QUEUE_TIMES, + queueStats.qtimes[N_QUEUE_TIMES], queueStats.exectimes[N_QUEUE_TIMES]); +} diff --git a/server/core/test/CMakeLists.txt b/server/core/test/CMakeLists.txt index 433fe5993..0107f127d 100644 --- a/server/core/test/CMakeLists.txt +++ b/server/core/test/CMakeLists.txt @@ -11,6 +11,7 @@ add_executable(test_service testservice.c) add_executable(test_server testserver.c) add_executable(test_users testusers.c) add_executable(test_adminusers testadminusers.c) +add_executable(testmemlog testmemlog.c) target_link_libraries(test_mysql_users MySQLClient fullcore) target_link_libraries(test_hash fullcore) target_link_libraries(test_hint fullcore) @@ -25,6 +26,7 @@ target_link_libraries(test_server fullcore) target_link_libraries(test_users fullcore) target_link_libraries(test_adminusers fullcore) add_test(testMySQLUsers test_mysql_users) +target_link_libraries(testmemlog fullcore) add_test(TestHash test_hash) add_test(TestHint test_hint) add_test(TestSpinlock test_spinlock) @@ -37,3 +39,4 @@ add_test(TestService test_service) add_test(TestServer test_server) add_test(TestUsers test_users) add_test(TestAdminUsers test_adminusers) +add_test(TestMemlog testmemlog) diff --git a/server/core/test/makefile b/server/core/test/makefile index a657077e2..e3d7a4ce5 100644 --- a/server/core/test/makefile +++ b/server/core/test/makefile @@ -22,7 +22,7 @@ LIBS= -L$(EMBEDDED_LIB) -lmysqld \ -lz -lm -lcrypt -lcrypto -ldl -laio -lrt -pthread -llog_manager \ -L../../inih/extra -linih -lssl -lstdc++ -TESTS=testhash testspinlock testbuffer testmodutil testpoll testservice testdcb testfilter testadminusers +TESTS=testhash testspinlock testbuffer testmodutil testpoll testservice testdcb testfilter testadminusers testmemlog cleantests: - $(DEL) *.o @@ -100,6 +100,13 @@ testadminusers: testadminusers.c libcore.a -I$(ROOT_PATH)/utils \ testadminusers.c libcore.a $(UTILSPATH)/skygw_utils.o $(LIBS) -o testadminusers +testmemlog: testmemlog.c libcore.a + $(CC) $(CFLAGS) $(LDFLAGS) \ + -I$(ROOT_PATH)/server/include \ + -I$(ROOT_PATH)/utils \ + testmemlog.c libcore.a $(UTILSPATH)/skygw_utils.o $(LIBS) -o testmemlog + + libcore.a: ../*.o ar rv libcore.a ../*.o diff --git a/server/core/test/testmemlog.c b/server/core/test/testmemlog.c new file mode 100644 index 000000000..1523ec8ec --- /dev/null +++ b/server/core/test/testmemlog.c @@ -0,0 +1,404 @@ +/* + * This file is distributed as part of MaxScale from MariaDB. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright MariaDB Corporation 2014 + */ + +/** + * + * @verbatim + * Revision History + * + * Date Who Description + * 30/09/2014 Mark Riddoch Initial implementation + * + * @endverbatim + */ + +#include +#include +#include +#include +#include + +/** + * Count the number of lines in a file + * + * @param file The name of the file + * @return -1 if the file could not be opened or the numebr of lines + */ +int +linecount(char *file) +{ +FILE *fp; +int i = 0; +char buffer[180]; + + if ((fp = fopen(file, "r")) == NULL) + return -1; + while (fgets(buffer, 180, fp) != NULL) + i++; + fclose(fp); + return i; +} + +/* Some strings to log */ +char *strings[] = { + "First log entry", + "Second entry", + "Third", + "The fourth thing to log", + "Add a final 5th item" +}; + +int +main() +{ +MEMLOG *log, *log2; +int i; +long j; +long long k; +int failures = 0; + + unlink("memlog1"); + if ((log = memlog_create("memlog1", ML_INT, 100)) == NULL) + { + printf("Memlog Creation: Failed\n"); + failures++; + } + else + { + printf("Memlog Creation: Passed\n"); + if (access("memlog1",R_OK) == 0) + { + printf("File existance 1: Failed\n"); + failures++; + } + else + printf("File existance 1: Passed\n"); + for (i = 0; i < 50; i++) + memlog_log(log, (void *)i); + if (access("memlog1",R_OK) == 0) + { + printf("File existance 2: Failed\n"); + failures++; + } + else + printf("File existance 2: Passed\n"); + for (i = 0; i < 50; i++) + memlog_log(log, (void *)i); + if (access("memlog1",R_OK) != 0) + { + printf("File existance 3: Failed\n"); + failures++; + } + else + printf("File existance 3: Passed\n"); + if (linecount("memlog1") != 100) + { + printf("Incorrect entry count: Failed\n"); + failures++; + } + else + printf("Incorrect entry count: Passed\n"); + for (i = 0; i < 50; i++) + memlog_log(log, (void *)i); + if (linecount("memlog1") != 100) + { + printf("Premature Flushing: Failed\n"); + failures++; + } + else + printf("Premature Flushing: Passed\n"); + memlog_destroy(log); + if (linecount("memlog1") != 150) + { + printf("Flush on destroy: Failed\n"); + failures++; + } + else + printf("Flush on destroy: Passed\n"); + } + + unlink("memlog2"); + if ((log = memlog_create("memlog2", ML_LONG, 100)) == NULL) + { + printf("Memlog Creation: Failed\n"); + failures++; + } + else + { + printf("Memlog Creation: Passed\n"); + if (access("memlog2",R_OK) == 0) + { + printf("File existance 1: Failed\n"); + failures++; + } + else + printf("File existance 1: Passed\n"); + for (j = 0; j < 50; j++) + memlog_log(log, (void *)j); + if (access("memlog2",R_OK) == 0) + { + printf("File existance 2: Failed\n"); + failures++; + } + else + printf("File existance 2: Passed\n"); + for (j = 0; j < 50; j++) + memlog_log(log, (void *)j); + if (access("memlog2",R_OK) != 0) + { + printf("File existance 3: Failed\n"); + failures++; + } + else + printf("File existance 3: Passed\n"); + if (linecount("memlog2") != 100) + { + printf("Incorrect entry count: Failed\n"); + failures++; + } + else + printf("Incorrect entry count: Passed\n"); + for (j = 0; j < 50; j++) + memlog_log(log, (void *)j); + if (linecount("memlog2") != 100) + { + printf("Premature Flushing: Failed\n"); + failures++; + } + else + printf("Premature Flushing: Passed\n"); + memlog_destroy(log); + if (linecount("memlog2") != 150) + { + printf("Flush on destroy: Failed\n"); + failures++; + } + else + printf("Flush on destroy: Passed\n"); + } + + unlink("memlog3"); + if ((log = memlog_create("memlog3", ML_LONGLONG, 100)) == NULL) + { + printf("Memlog Creation: Failed\n"); + failures++; + } + else + { + printf("Memlog Creation: Passed\n"); + if (access("memlog3",R_OK) == 0) + { + printf("File existance 1: Failed\n"); + failures++; + } + else + printf("File existance 1: Passed\n"); + for (k = 0; k < 50; k++) + memlog_log(log, (void *)k); + if (access("memlog3",R_OK) == 0) + { + printf("File existance 2: Failed\n"); + failures++; + } + else + printf("File existance 2: Passed\n"); + for (k = 0; k < 50; k++) + memlog_log(log, (void *)k); + if (access("memlog3",R_OK) != 0) + { + printf("File existance 3: Failed\n"); + failures++; + } + else + printf("File existance 3: Passed\n"); + if (linecount("memlog3") != 100) + { + printf("Incorrect entry count: Failed\n"); + failures++; + } + else + printf("Incorrect entry count: Passed\n"); + for (k = 0; k < 50; k++) + memlog_log(log, (void *)k); + if (linecount("memlog3") != 100) + { + printf("Premature Flushing: Failed\n"); + failures++; + } + else + printf("Premature Flushing: Passed\n"); + memlog_destroy(log); + if (linecount("memlog3") != 150) + { + printf("Flush on destroy: Failed\n"); + failures++; + } + else + printf("Flush on destroy: Passed\n"); + } + + unlink("memlog4"); + if ((log = memlog_create("memlog4", ML_STRING, 100)) == NULL) + { + printf("Memlog Creation: Failed\n"); + failures++; + } + else + { + printf("Memlog Creation: Passed\n"); + if (access("memlog4",R_OK) == 0) + { + printf("File existance 1: Failed\n"); + failures++; + } + else + printf("File existance 1: Passed\n"); + for (i = 0; i < 50; i++) + memlog_log(log, strings[i%5]); + if (access("memlog4",R_OK) == 0) + { + printf("File existance 2: Failed\n"); + failures++; + } + else + printf("File existance 2: Passed\n"); + for (i = 0; i < 50; i++) + memlog_log(log, strings[i%5]); + if (access("memlog4",R_OK) != 0) + { + printf("File existance 3: Failed\n"); + failures++; + } + else + printf("File existance 3: Passed\n"); + if (linecount("memlog4") != 100) + { + printf("Incorrect entry count: Failed\n"); + failures++; + } + else + printf("Incorrect entry count: Passed\n"); + for (i = 0; i < 50; i++) + memlog_log(log, strings[i%5]); + if (linecount("memlog4") != 100) + { + printf("Premature Flushing: Failed\n"); + failures++; + } + else + printf("Premature Flushing: Passed\n"); + memlog_destroy(log); + if (linecount("memlog4") != 150) + { + printf("Flush on destroy: Failed\n"); + failures++; + } + else + printf("Flush on destroy: Passed\n"); + } + + unlink("memlog5"); + unlink("memlog6"); + if ((log = memlog_create("memlog5", ML_INT, 100)) == NULL) + { + printf("Memlog Creation: Failed\n"); + failures++; + } + else + { + printf("Memlog Creation: Passed\n"); + if ((log2 = memlog_create("memlog6", ML_INT, 100)) == NULL) + { + printf("Memlog Creation: Failed\n"); + failures++; + } + else + { + printf("Memlog Creation: Passed\n"); + for (i = 0; i < 40; i++) + memlog_log(log, (void *)i); + for (i = 0; i < 30; i++) + memlog_log(log2, (void *)i); + memlog_flush_all(); + if (linecount("memlog5") != 40 || + linecount("memlog6") != 30) + { + printf( + "Memlog flush all: Failed\n"); + failures++; + } + else + printf( + "Memlog flush all: Passed\n"); + } + } + + unlink("memlog7"); + if ((log = memlog_create("memlog7", ML_INT, 100)) == NULL) + { + printf("Memlog Creation: Failed\n"); + failures++; + } + else + { + printf("Memlog Creation: Passed\n"); + if (access("memlog7",R_OK) == 0) + { + printf("File existance 1: Failed\n"); + failures++; + } + else + printf("File existance 1: Passed\n"); + for (i = 0; i < 5050; i++) + memlog_log(log, (void *)i); + if (access("memlog7",R_OK) != 0) + { + printf("File existance 3: Failed\n"); + failures++; + } + else + printf("File existance 3: Passed\n"); + if (linecount("memlog7") != 5000) + { + printf("Incorrect entry count: Failed\n"); + failures++; + } + else + printf("Incorrect entry count: Passed\n"); + for (i = 0; i < 50; i++) + memlog_log(log, (void *)i); + if (linecount("memlog7") != 5100) + { + printf("Residual flushing: Failed\n"); + failures++; + } + else + printf("Premature Flushing: Passed\n"); + for (i = 0; i < 10120; i++) + memlog_log(log, (void *)i); + memlog_destroy(log); + if (linecount("memlog7") != 15220) + { + printf("Flush on destroy: Failed\n"); + failures++; + } + else + printf("Flush on destroy: Passed\n"); + } + exit(failures); +} diff --git a/server/include/config.h b/server/include/config.h index 0582bc50e..ca3092576 100644 --- a/server/include/config.h +++ b/server/include/config.h @@ -29,10 +29,13 @@ * 21/06/13 Mark Riddoch Initial implementation * 07/05/14 Massimiliano Pinto Added version_string to global configuration * 23/05/14 Massimiliano Pinto Added id to global configuration + * 17/10/14 Mark Riddoch Added poll tuning configuration parameters * * @endverbatim */ +#define DEFAULT_NBPOLLS 3 /**< Default number of non block polls before we block */ +#define DEFAULT_POLLSLEEP 1000 /**< Default poll wait time (milliseconds) */ /** * Maximum length for configuration parameter value. */ @@ -92,11 +95,15 @@ typedef struct { int n_threads; /**< Number of polling threads */ char *version_string; /**< The version string of embedded database library */ unsigned long id; /**< MaxScale ID */ + unsigned int n_nbpoll; /**< Tune number of non-blocking polls */ + unsigned int pollsleep; /**< Wait time in blocking polls */ } GATEWAY_CONF; extern int config_load(char *); extern int config_reload(); extern int config_threadcount(); +extern unsigned int config_nbpolls(); +extern unsigned int config_pollsleep(); CONFIG_PARAMETER* config_get_param(CONFIG_PARAMETER* params, const char* name); config_param_type_t config_get_paramtype(CONFIG_PARAMETER* param); CONFIG_PARAMETER* config_clone_param(CONFIG_PARAMETER* param); diff --git a/server/include/dcb.h b/server/include/dcb.h index 66d18bcf7..d9cdeb05a 100644 --- a/server/include/dcb.h +++ b/server/include/dcb.h @@ -98,12 +98,28 @@ typedef struct gw_protocol { int (*session)(struct dcb *, void *); } GWPROTOCOL; +/** + * The event queue structure used in the polling loop to maintain a queue + * of events that need to be processed for the DCB. + * + * next The next DCB in the event queue + * prev The previous DCB in the event queue + * pending_events The events that are pending processing + * processing_events The evets currently being processed + * processing Flag to indicate the processing status of the DCB + * eventqlock Spinlock to protect this structure + * inserted Insertion time for logging purposes + * started Time that the processign started + */ typedef struct { struct dcb *next; struct dcb *prev; uint32_t pending_events; + uint32_t processing_events; int processing; SPINLOCK eventqlock; + unsigned long inserted; + unsigned long started; } DCBEVENTQ; /** diff --git a/server/include/housekeeper.h b/server/include/housekeeper.h index 14523747d..0379ff23a 100644 --- a/server/include/housekeeper.h +++ b/server/include/housekeeper.h @@ -18,6 +18,7 @@ * Copyright MariaDB Corporation Ab 2014 */ #include +#include /** * @file housekeeper.h A mechanism to have task run periodically @@ -31,6 +32,11 @@ * @endverbatim */ +typedef enum { + HK_REPEATED = 1, + HK_ONESHOT +} HKTASK_TYPE; + /** * The housekeeper task list */ @@ -40,12 +46,22 @@ typedef struct hktask { void *data; /*< Data to pass the task */ int frequency; /*< How often to call the tasks (seconds) */ time_t nextdue; /*< When the task should be next run */ + HKTASK_TYPE + type; /*< The task type */ struct hktask *next; /*< Next task in the list */ } HKTASK; +/** + * The global housekeeper heartbeat value. This value is increamente + * every 100ms and may be used for crude timing etc. + */ +extern unsigned long hkheartbeat; + extern void hkinit(); extern int hktask_add(char *name, void (*task)(void *), void *data, int frequency); +extern int hktask_oneshot(char *name, void (*task)(void *), void *data, int when); extern int hktask_remove(char *name); extern void hkshutdown(); +extern void hkshow_tasks(DCB *pdcb); #endif diff --git a/server/include/memlog.h b/server/include/memlog.h new file mode 100644 index 000000000..de183a48d --- /dev/null +++ b/server/include/memlog.h @@ -0,0 +1,65 @@ +#ifndef _MEMLOG_H +#define _MEMLOG_H +/* + * This file is distributed as part of MariaDB MaxScale. It is free + * software: you can redistribute it and/or modify it under the terms of the + * GNU General Public License as published by the Free Software Foundation, + * version 2. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Copyright MariaDB Ab 2014 + */ + +/** + * @file memlog.h The memory logging mechanism + * + * @verbatim + * Revision History + * + * Date Who Description + * 26/09/14 Mark Riddoch Initial implementation + * + * @endverbatim + */ +#include + +typedef enum { ML_INT, ML_LONG, ML_LONGLONG, ML_STRING } MEMLOGTYPE; + +typedef struct memlog { + char *name; + SPINLOCK lock; + void *values; + int offset; + int size; + MEMLOGTYPE type; + unsigned int flags; + unsigned int iflags; + struct memlog *next; +} MEMLOG; + +/* + * MEMLOG flag bits + */ +#define MLNOAUTOFLUSH 0x0001 + +/* + * MEMLOG internal flags + */ +#define MLWRAPPED 0x0001 + + +extern MEMLOG *memlog_create(char *, MEMLOGTYPE, int); +extern void memlog_destroy(MEMLOG *); +extern void memlog_set(MEMLOG *, unsigned int); +extern void memlog_log(MEMLOG *, void *); +extern void memlog_flush_all(); +extern void memlog_flush(MEMLOG *); +#endif diff --git a/server/include/modules.h b/server/include/modules.h index 904e6cc7a..adda2b255 100644 --- a/server/include/modules.h +++ b/server/include/modules.h @@ -32,6 +32,8 @@ * 13/06/13 Mark Riddoch Initial implementation * 08/07/13 Mark Riddoch Addition of monitor modules * 29/05/14 Mark Riddoch Addition of filter modules + * 01/10/14 Mark Riddoch Addition of call to unload all modules on + * shutdown * @endverbatim */ @@ -58,6 +60,7 @@ typedef struct modules { extern void *load_module(const char *module, const char *type); extern void unload_module(const char *module); +extern void unload_all_modules(); extern void printModules(); extern void dprintAllModules(DCB *); char* get_maxscale_home(void); diff --git a/server/include/poll.h b/server/include/poll.h index bc8b8ccc9..24bf0645d 100644 --- a/server/include/poll.h +++ b/server/include/poll.h @@ -32,7 +32,6 @@ * @endverbatim */ #define MAX_EVENTS 1000 -#define EPOLL_TIMEOUT 1000 /**< The epoll timeout in milliseconds */ extern void poll_init(); extern int poll_add_dcb(DCB *); @@ -40,8 +39,11 @@ extern int poll_remove_dcb(DCB *); extern void poll_waitevents(void *); extern void poll_shutdown(); extern GWBITMASK *poll_bitmask(); +extern void poll_set_maxwait(unsigned int); +extern void poll_set_nonblocking_polls(unsigned int); extern void dprintPollStats(DCB *); extern void dShowThreads(DCB *dcb); void poll_add_epollin_event_to_dcb(DCB* dcb, GWBUF* buf); - +extern void dShowEventQ(DCB *dcb); +extern void dShowEventStats(DCB *dcb); #endif diff --git a/server/inih/._LICENSE.txt b/server/inih/._LICENSE.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/._LICENSE.txt and /dev/null differ diff --git a/server/inih/._README.txt b/server/inih/._README.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/._README.txt and /dev/null differ diff --git a/server/inih/._cpp b/server/inih/._cpp deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/._cpp and /dev/null differ diff --git a/server/inih/._examples b/server/inih/._examples deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/._examples and /dev/null differ diff --git a/server/inih/._extra b/server/inih/._extra deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/._extra and /dev/null differ diff --git a/server/inih/._ini.c b/server/inih/._ini.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/._ini.c and /dev/null differ diff --git a/server/inih/._ini.h b/server/inih/._ini.h deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/._ini.h and /dev/null differ diff --git a/server/inih/._tests b/server/inih/._tests deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/._tests and /dev/null differ diff --git a/server/inih/cpp/._INIReader.cpp b/server/inih/cpp/._INIReader.cpp deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/cpp/._INIReader.cpp and /dev/null differ diff --git a/server/inih/cpp/._INIReader.h b/server/inih/cpp/._INIReader.h deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/cpp/._INIReader.h and /dev/null differ diff --git a/server/inih/cpp/._INIReaderTest.cpp b/server/inih/cpp/._INIReaderTest.cpp deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/cpp/._INIReaderTest.cpp and /dev/null differ diff --git a/server/inih/examples/._config.def b/server/inih/examples/._config.def deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/examples/._config.def and /dev/null differ diff --git a/server/inih/examples/._ini_dump.c b/server/inih/examples/._ini_dump.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/examples/._ini_dump.c and /dev/null differ diff --git a/server/inih/examples/._ini_example.c b/server/inih/examples/._ini_example.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/examples/._ini_example.c and /dev/null differ diff --git a/server/inih/examples/._ini_xmacros.c b/server/inih/examples/._ini_xmacros.c deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/examples/._ini_xmacros.c and /dev/null differ diff --git a/server/inih/examples/._test.ini b/server/inih/examples/._test.ini deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/examples/._test.ini and /dev/null differ diff --git a/server/inih/extra/._Makefile.static b/server/inih/extra/._Makefile.static deleted file mode 100755 index 17b8574a4..000000000 Binary files a/server/inih/extra/._Makefile.static and /dev/null differ diff --git a/server/inih/tests/._bad_comment.ini b/server/inih/tests/._bad_comment.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._bad_comment.ini and /dev/null differ diff --git a/server/inih/tests/._bad_multi.ini b/server/inih/tests/._bad_multi.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._bad_multi.ini and /dev/null differ diff --git a/server/inih/tests/._bad_section.ini b/server/inih/tests/._bad_section.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._bad_section.ini and /dev/null differ diff --git a/server/inih/tests/._baseline_multi.txt b/server/inih/tests/._baseline_multi.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._baseline_multi.txt and /dev/null differ diff --git a/server/inih/tests/._baseline_single.txt b/server/inih/tests/._baseline_single.txt deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._baseline_single.txt and /dev/null differ diff --git a/server/inih/tests/._bom.ini b/server/inih/tests/._bom.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._bom.ini and /dev/null differ diff --git a/server/inih/tests/._multi_line.ini b/server/inih/tests/._multi_line.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._multi_line.ini and /dev/null differ diff --git a/server/inih/tests/._normal.ini b/server/inih/tests/._normal.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._normal.ini and /dev/null differ diff --git a/server/inih/tests/._unittest.bat b/server/inih/tests/._unittest.bat deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._unittest.bat and /dev/null differ diff --git a/server/inih/tests/._unittest.c b/server/inih/tests/._unittest.c deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._unittest.c and /dev/null differ diff --git a/server/inih/tests/._user_error.ini b/server/inih/tests/._user_error.ini deleted file mode 100755 index 7fa6eb8a6..000000000 Binary files a/server/inih/tests/._user_error.ini and /dev/null differ diff --git a/server/modules/include/blr.h b/server/modules/include/blr.h index 70be1f579..282bcdcdf 100644 --- a/server/modules/include/blr.h +++ b/server/modules/include/blr.h @@ -33,19 +33,40 @@ #include #include +#include + #define BINLOG_FNAMELEN 16 #define BLR_PROTOCOL "MySQLBackend" #define BINLOG_MAGIC { 0xfe, 0x62, 0x69, 0x6e } #define BINLOG_NAMEFMT "%s.%06d" #define BINLOG_NAME_ROOT "mysql-bin" +/* How often to call the binlog status function (seconds) */ +#define BLR_STATS_FREQ 60 +#define BLR_NSTATS_MINUTES 30 + /** * High and Low water marks for the slave dcb. These values can be overriden * by the router options highwater and lowwater. */ -#define DEF_LOW_WATER 2000 -#define DEF_HIGH_WATER 30000 +#define DEF_LOW_WATER 1000 +#define DEF_HIGH_WATER 10000 +/** + * Default burst sizes for slave catchup + */ +#define DEF_SHORT_BURST 15 +#define DEF_LONG_BURST 500 +#define DEF_BURST_SIZE 1024000 /* 1 Mb */ + +/** + * master reconnect backoff constants + * BLR_MASTER_BACKOFF_TIME The increments of the back off time (seconds) + * BLR_MAX_BACKOFF Maximum number of increments to backoff to + */ + +#define BLR_MASTER_BACKOFF_TIME 5 +#define BLR_MAX_BACKOFF 60 /** * Some useful macros for examining the MySQL Response packets */ @@ -56,24 +77,71 @@ #define MYSQL_ERROR_MSG(buf) ((uint8_t *)GWBUF_DATA(buf) + 6) #define MYSQL_COMMAND(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4)) + +/** + * Packet header for replication messages + */ +typedef struct rep_header { + int payload_len; /*< Payload length (24 bits) */ + uint8_t seqno; /*< Response sequence number */ + uint8_t ok; /*< OK Byte from packet */ + uint32_t timestamp; /*< Timestamp - start of binlog record */ + uint8_t event_type; /*< Binlog event type */ + uint32_t serverid; /*< Server id of master */ + uint32_t event_size; /*< Size of header, post-header and body */ + uint32_t next_pos; /*< Position of next event */ + uint16_t flags; /*< Event flags */ +} REP_HEADER; + +/** + * The binlog record structure. This contains the actual packet read from the binlog + * file. + */ +typedef struct { + unsigned long position; /*< binlog record position for this cache entry */ + GWBUF *pkt; /*< The packet received from the master */ + REP_HEADER hdr; /*< The packet header */ +} BLCACHE_RECORD; + +/** + * The binlog cache. A cache exists for each file that hold cached bin log records. + * Caches will be used for all files being read by more than 1 slave. + */ +typedef struct { + BLCACHE_RECORD **records; /*< The actual binlog records */ + int current; /*< The next record that will be inserted */ + int cnt; /*< The number of records in the cache */ + SPINLOCK lock; /*< The spinlock for the cache */ +} BLCACHE; + +typedef struct blfile { + char binlogname[BINLOG_FNAMELEN+1]; /*< Name of the binlog file */ + int fd; /*< Actual file descriptor */ + int refcnt; /*< Reference count for file */ + BLCACHE *cache; /*< Record cache for this file */ + SPINLOCK lock; /*< The file lock */ + struct blfile *next; /*< Next file in list */ +} BLFILE; + /** * Slave statistics */ typedef struct { - int n_events; /*< Number of events sent */ - int n_bursts; /*< Number of bursts sent */ - int n_requests; /*< Number of requests received */ - int n_flows; /*< Number of flow control restarts */ - int n_catchupnr; /*< No. of times catchup resulted in not entering loop */ - int n_alreadyupd; - int n_upd; - int n_cb; - int n_cbna; - int n_dcb; - int n_above; - int n_failed_read; - int n_overrun; - int n_actions[3]; + int n_events; /*< Number of events sent */ + int n_bursts; /*< Number of bursts sent */ + int n_requests; /*< Number of requests received */ + int n_flows; /*< Number of flow control restarts */ + int n_upd; + int n_cb; + int n_cbna; + int n_dcb; + int n_above; + int n_failed_read; + int n_overrun; + int n_actions[3]; + uint64_t lastsample; + int minno; + int minavgs[BLR_NSTATS_MINUTES]; } SLAVE_STATS; /** @@ -89,6 +157,7 @@ typedef struct router_slave { int binlog_pos; /*< Binlog position for this slave */ char binlogfile[BINLOG_FNAMELEN+1]; /*< Current binlog file for this slave */ + BLFILE *file; /*< Currently open binlog file */ int serverid; /*< Server-id of the slave */ char *hostname; /*< Hostname of the slave, if known */ char *user; /*< Username if given */ @@ -132,6 +201,9 @@ typedef struct { uint64_t n_fakeevents; /*< Fake events not written to disk */ uint64_t n_artificial; /*< Artificial events not written to disk */ uint64_t events[0x24]; /*< Per event counters */ + uint64_t lastsample; + int minno; + int minavgs[BLR_NSTATS_MINUTES]; } ROUTER_STATS; /** @@ -153,37 +225,6 @@ typedef struct { int fde_len; /*< Length of fde_event */ } MASTER_RESPONSES; -/** - * The binlog record structure. This contains the actual packet received from the - * master, the binlog position of the data in the packet, a point to the data and - * the length of the binlog record. - * - * This allows requests for binlog records in the cache to be serviced by simply - * sending the exact same packet as was received by MaxScale from the master. - * Items are written to the backing file as soon as they are received. The binlog - * cache is flushed of old records periodically, releasing the GWBUF's back to the - * free memory pool. - */ -typedef struct { - unsigned long position; /*< binlog record position for this cache entry */ - GWBUF *pkt; /*< The packet received from the master */ - unsigned char *data; /*< Pointer to the data within the packet */ - unsigned int record_len; /*< Binlog record length */ -} BLCACHE_RECORD; - -/** - * The binlog cache. A cache exists for each file that hold cached bin log records. - * Typically the router will hold two binlog caches, one for the current file and one - * for the previous file. - */ -typedef struct { - char filename[BINLOG_FNAMELEN+1]; - BLCACHE_RECORD *first; - BLCACHE_RECORD *current; - int cnt; -} BLCACHE; - - /** * The per instance data for the router. */ @@ -205,6 +246,8 @@ typedef struct router_instance { uint8_t lastEventReceived; GWBUF *residual; /*< Any residual binlog event */ MASTER_RESPONSES saved_master; /*< Saved master responses */ + char *binlogdir; /*< The directory with the binlog files */ + SPINLOCK binlog_lock; /*< Lock to control update of the binlog position */ char binlog_name[BINLOG_FNAMELEN+1]; /*< Name of the current binlog file */ uint64_t binlog_position; @@ -212,32 +255,25 @@ typedef struct router_instance { int binlog_fd; /*< File descriptor of the binlog * file being written */ + uint64_t last_written; /*< Position of last event written */ + char prevbinlog[BINLOG_FNAMELEN+1]; + int rotating; /*< Rotation in progress flag */ + BLFILE *files; /*< Files used by the slaves */ + SPINLOCK fileslock; /*< Lock for the files queue above */ unsigned int low_water; /*< Low water mark for client DCB */ unsigned int high_water; /*< High water mark for client DCB */ - BLCACHE *cache[2]; + unsigned int short_burst; /*< Short burst for slave catchup */ + unsigned int long_burst; /*< Long burst for slave catchup */ + unsigned long burst_size; /*< Maximum size of burst to send */ ROUTER_STATS stats; /*< Statistics for this router */ int active_logs; int reconnect_pending; + int retry_backoff; int handling_threads; struct router_instance *next; } ROUTER_INSTANCE; -/** - * Packet header for replication messages - */ -typedef struct rep_header { - int payload_len; /*< Payload length (24 bits) */ - uint8_t seqno; /*< Response sequence number */ - uint8_t ok; /*< OK Byte from packet */ - uint32_t timestamp; /*< Timestamp - start of binlog record */ - uint8_t event_type; /*< Binlog event type */ - uint32_t serverid; /*< Server id of master */ - uint32_t event_size; /*< Size of header, post-header and body */ - uint32_t next_pos; /*< Position of next event */ - uint16_t flags; /*< Event flags */ -} REP_HEADER; - /** * State machine for the master to MaxScale replication */ @@ -270,21 +306,23 @@ static char *blrm_states[] = { "Unconnected", "Authenticated", "Timestamp retrie #define BLRS_UNREGISTERED 0x0001 #define BLRS_REGISTERED 0x0002 #define BLRS_DUMPING 0x0003 +#define BLRS_ERRORED 0x0004 -#define BLRS_MAXSTATE 0x0003 +#define BLRS_MAXSTATE 0x0004 static char *blrs_states[] = { "Created", "Unregistered", "Registered", - "Sending binlogs" }; + "Sending binlogs", "Errored" }; /** * Slave catch-up status */ -#define CS_READING 0x0001 -#define CS_INNERLOOP 0x0002 #define CS_UPTODATE 0x0004 #define CS_EXPECTCB 0x0008 #define CS_DIST 0x0010 #define CS_DISTLATCH 0x0020 +#define CS_THRDWAIT 0x0040 +#define CS_BUSY 0x0100 +#define CS_HOLD 0x0200 /** * MySQL protocol OpCodes needed for replication @@ -347,22 +385,45 @@ static char *blrs_states[] = { "Created", "Unregistered", "Registered", #define LOG_EVENT_NO_FILTER_F 0x0100 #define LOG_EVENT_MTS_ISOLATE_F 0x0200 +/** + * Macros to extract common fields + */ +#define INLINE_EXTRACT 1 /* Set to 0 for debug purposes */ + +#if INLINE_EXTRACT +#define EXTRACT16(x) (*(uint8_t *)(x) | (*((uint8_t *)(x) + 1) << 8)) +#define EXTRACT24(x) (*(uint8_t *)(x) | \ + (*((uint8_t *)(x) + 1) << 8) | \ + (*((uint8_t *)(x) + 2) << 16)) +#define EXTRACT32(x) (*(uint8_t *)(x) | \ + (*((uint8_t *)(x) + 1) << 8) | \ + (*((uint8_t *)(x) + 2) << 16) | \ + (*((uint8_t *)(x) + 3) << 24)) +#else +#define EXTRACT16(x) extract_field((x), 16) +#define EXTRACT24(x) extract_field((x), 24) +#define EXTRACT32(x) extract_field((x), 32) +#endif + /* * Externals within the router */ extern void blr_start_master(ROUTER_INSTANCE *); extern void blr_master_response(ROUTER_INSTANCE *, GWBUF *); extern void blr_master_reconnect(ROUTER_INSTANCE *); +extern int blr_master_connected(ROUTER_INSTANCE *); extern int blr_slave_request(ROUTER_INSTANCE *, ROUTER_SLAVE *, GWBUF *); extern void blr_slave_rotate(ROUTER_SLAVE *slave, uint8_t *ptr); -extern int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave); +extern int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, bool large); extern void blr_init_cache(ROUTER_INSTANCE *); extern void blr_file_init(ROUTER_INSTANCE *); -extern int blr_open_binlog(ROUTER_INSTANCE *, char *); extern void blr_write_binlog_record(ROUTER_INSTANCE *, REP_HEADER *,uint8_t *); extern void blr_file_rotate(ROUTER_INSTANCE *, char *, uint64_t); extern void blr_file_flush(ROUTER_INSTANCE *); -extern GWBUF *blr_read_binlog(int, unsigned int, REP_HEADER *); +extern BLFILE *blr_open_binlog(ROUTER_INSTANCE *, char *); +extern GWBUF *blr_read_binlog(ROUTER_INSTANCE *, BLFILE *, unsigned int, REP_HEADER *); +extern void blr_close_binlog(ROUTER_INSTANCE *, BLFILE *); +extern unsigned long blr_file_size(BLFILE *); #endif diff --git a/server/modules/protocol/mysql_backend.c b/server/modules/protocol/mysql_backend.c index fe6c47ea7..0dcec1b70 100644 --- a/server/modules/protocol/mysql_backend.c +++ b/server/modules/protocol/mysql_backend.c @@ -823,6 +823,19 @@ static int gw_error_backend_event(DCB *dcb) */ if (dcb->state != DCB_STATE_POLLING) { + int error, len; + char buf[100]; + + len = sizeof(error); + if (getsockopt(dcb->fd, SOL_SOCKET, SO_ERROR, &error, &len) == 0) + { + strerror_r(error, buf, 100); + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "DCB in state %s got error '%s'.", + gw_dcb_state2string(dcb->state), + buf))); + } return 1; } errbuf = mysql_create_custom_error( @@ -849,6 +862,18 @@ static int gw_error_backend_event(DCB *dcb) if (ses_state != SESSION_STATE_ROUTER_READY) { + int error, len; + char buf[100]; + + len = sizeof(error); + if (getsockopt(dcb->fd, SOL_SOCKET, SO_ERROR, &error, &len) == 0) + { + strerror_r(error, buf, 100); + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Error '%s' in session that is not ready for routing.", + buf))); + } gwbuf_free(errbuf); goto retblock; } @@ -1040,6 +1065,19 @@ gw_backend_hangup(DCB *dcb) if (ses_state != SESSION_STATE_ROUTER_READY) { + int error, len; + char buf[100]; + + len = sizeof(error); + if (getsockopt(dcb->fd, SOL_SOCKET, SO_ERROR, &error, &len) == 0) + { + strerror_r(error, buf, 100); + LOGIF(LE, (skygw_log_write_flush( + LOGFILE_ERROR, + "Hangup in session that is not ready for routing, " + "Error reported is '%s'.", + buf))); + } gwbuf_free(errbuf); goto retblock; } diff --git a/server/modules/routing/binlog/CMakeLists.txt b/server/modules/routing/binlog/CMakeLists.txt index 071901b70..4de2a35b4 100644 --- a/server/modules/routing/binlog/CMakeLists.txt +++ b/server/modules/routing/binlog/CMakeLists.txt @@ -1,4 +1,4 @@ add_library(binlogrouter SHARED blr.c blr_master.c blr_cache.c blr_slave.c blr_file.c) set_target_properties(binlogrouter PROPERTIES INSTALL_RPATH ${CMAKE_INSTALL_RPATH}:${CMAKE_INSTALL_PREFIX}/lib) -target_link_libraries(binlogrouter ssl pthread log_manager ${EMBEDDED_LIB}) +target_link_libraries(binlogrouter ssl pthread log_manager) install(TARGETS binlogrouter DESTINATION modules) diff --git a/server/modules/routing/binlog/Makefile b/server/modules/routing/binlog/Makefile index 991e86ccd..0e7c0a778 100644 --- a/server/modules/routing/binlog/Makefile +++ b/server/modules/routing/binlog/Makefile @@ -31,10 +31,14 @@ CFLAGS=-c -fPIC -I/usr/include -I../../include -I../../../include \ include ../../../../makefile.inc +#LDFLAGS=-shared -L$(LOGPATH) -L$(QCLASSPATH) -L$(EMBEDDED_LIB) \ +# -Wl,-rpath,$(DEST)/lib \ +# -Wl,-rpath,$(LOGPATH) -Wl,-rpath,$(UTILSPATH) -Wl,-rpath,$(QCLASSPATH) \ +# -Wl,-rpath,$(EMBEDDED_LIB) LDFLAGS=-shared -L$(LOGPATH) -L$(QCLASSPATH) -L$(EMBEDDED_LIB) \ -Wl,-rpath,$(DEST)/lib \ - -Wl,-rpath,$(LOGPATH) -Wl,-rpath,$(UTILSPATH) -Wl,-rpath,$(QCLASSPATH) \ - -Wl,-rpath,$(EMBEDDED_LIB) + -Wl,-rpath,$(LOGPATH) -Wl,-rpath,$(UTILSPATH) + SRCS=blr.c blr_master.c blr_cache.c blr_slave.c blr_file.c OBJ=$(SRCS:.c=.o) diff --git a/server/modules/routing/binlog/blr.c b/server/modules/routing/binlog/blr.c index 5f83eb5f0..bded77735 100644 --- a/server/modules/routing/binlog/blr.c +++ b/server/modules/routing/binlog/blr.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include #include #include @@ -95,6 +97,8 @@ static ROUTER_OBJECT MyObject = { getCapabilities }; +static void stats_func(void *); + static bool rses_begin_locked_router_action(ROUTER_SLAVE *); static void rses_end_locked_router_action(ROUTER_SLAVE *); @@ -157,7 +161,7 @@ static ROUTER * createInstance(SERVICE *service, char **options) { ROUTER_INSTANCE *inst; -char *value; +char *value, *name; int i; if ((inst = calloc(1, sizeof(ROUTER_INSTANCE))) == NULL) { @@ -169,10 +173,19 @@ int i; inst->service = service; spinlock_init(&inst->lock); + inst->files = NULL; + spinlock_init(&inst->fileslock); + spinlock_init(&inst->binlog_lock); + + inst->binlog_fd = -1; inst->low_water = DEF_LOW_WATER; inst->high_water = DEF_HIGH_WATER; inst->initbinlog = 0; + inst->short_burst = DEF_SHORT_BURST; + inst->long_burst = DEF_LONG_BURST; + inst->burst_size = DEF_BURST_SIZE; + inst->retry_backoff = 1; /* * We only support one server behind this router, since the server is @@ -249,6 +262,10 @@ int i; { inst->initbinlog = atoi(value); } + else if (strcmp(options[i], "file") == 0) + { + inst->initbinlog = atoi(value); + } else if (strcmp(options[i], "lowwater") == 0) { inst->low_water = atoi(value); @@ -257,6 +274,38 @@ int i; { inst->high_water = atoi(value); } + else if (strcmp(options[i], "shortburst") == 0) + { + inst->short_burst = atoi(value); + } + else if (strcmp(options[i], "longburst") == 0) + { + inst->long_burst = atoi(value); + } + else if (strcmp(options[i], "burstsize") == 0) + { + unsigned long size = atoi(value); + char *ptr = value; + while (*ptr && isdigit(*ptr)) + ptr++; + switch (*ptr) + { + case 'G': + case 'g': + size = size * 1024 * 1000 * 1000; + break; + case 'M': + case 'm': + size = size * 1024 * 1000; + break; + case 'K': + case 'k': + size = size * 1024; + break; + } + inst->burst_size = size; + + } else { LOGIF(LE, (skygw_log_write( @@ -284,6 +333,7 @@ int i; inst->active_logs = 0; inst->reconnect_pending = 0; inst->handling_threads = 0; + inst->rotating = 0; inst->residual = NULL; inst->slaves = NULL; inst->next = NULL; @@ -302,6 +352,12 @@ int i; */ blr_init_cache(inst); + if ((name = (char *)malloc(80)) != NULL) + { + sprintf(name, "%s stats", service->name); + hktask_add(name, stats_func, inst, BLR_STATS_FREQ); + } + /* * Now start the replication from the master to MaxScale */ @@ -358,6 +414,8 @@ ROUTER_SLAVE *slave; spinlock_init(&slave->catch_lock); slave->dcb = session->client; slave->router = inst; + slave->file = NULL; + strcpy(slave->binlogfile, "unassigned"); /** * Add this session to the list of active sessions. @@ -477,6 +535,9 @@ ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_session; */ slave->state = BLRS_UNREGISTERED; + if (slave->file) + blr_close_binlog(router, slave->file); + /* Unlock */ rses_end_locked_router_action(slave); } @@ -541,7 +602,9 @@ diagnostics(ROUTER *router, DCB *dcb) { ROUTER_INSTANCE *router_inst = (ROUTER_INSTANCE *)router; ROUTER_SLAVE *session; -int i = 0; +int i = 0, j; +int minno = 0; +double min5, min10, min15, min30; char buf[40]; struct tm tm; @@ -554,6 +617,30 @@ struct tm tm; } spinlock_release(&router_inst->lock); + minno = router_inst->stats.minno; + min30 = 0.0; + min15 = 0.0; + min10 = 0.0; + min5 = 0.0; + for (j = 0; j < 30; j++) + { + minno--; + if (minno < 0) + minno += 30; + min30 += router_inst->stats.minavgs[minno]; + if (j < 15) + min15 += router_inst->stats.minavgs[minno]; + if (j < 10) + min10 += router_inst->stats.minavgs[minno]; + if (j < 5) + min5 += router_inst->stats.minavgs[minno]; + } + min30 /= 30.0; + min15 /= 15.0; + min10 /= 10.0; + min5 /= 5.0; + + dcb_printf(dcb, "\tMaster connection DCB: %p\n", router_inst->master); dcb_printf(dcb, "\tMaster connection state: %s\n", @@ -574,6 +661,13 @@ struct tm tm; router_inst->stats.n_slaves); dcb_printf(dcb, "\tNumber of binlog events received: %u\n", router_inst->stats.n_binlogs); + minno = router_inst->stats.minno - 1; + if (minno == -1) + minno = 30; + dcb_printf(dcb, "\tNumber of binlog events per minute\n"); + dcb_printf(dcb, "\tCurrent 5 10 15 30 Min Avg\n"); + dcb_printf(dcb, "\t %6d %8.1f %8.1f %8.1f %8.1f\n", + router_inst->stats.minavgs[minno], min5, min10, min15, min30); dcb_printf(dcb, "\tNumber of fake binlog events: %u\n", router_inst->stats.n_fakeevents); dcb_printf(dcb, "\tNumber of artificial binlog events: %u\n", @@ -582,10 +676,6 @@ struct tm tm; router_inst->stats.n_binlog_errors); dcb_printf(dcb, "\tNumber of binlog rotate events: %u\n", router_inst->stats.n_rotates); - dcb_printf(dcb, "\tNumber of binlog cache hits: %u\n", - router_inst->stats.n_cachehits); - dcb_printf(dcb, "\tNumber of binlog cache misses: %u\n", - router_inst->stats.n_cachemisses); dcb_printf(dcb, "\tNumber of heartbeat events: %u\n", router_inst->stats.n_heartbeats); dcb_printf(dcb, "\tNumber of packets received: %u\n", @@ -615,6 +705,8 @@ struct tm tm; spinlock_stats(&instlock, spin_reporter, dcb); dcb_printf(dcb, "\tSpinlock statistics (instance lock):\n"); spinlock_stats(&router_inst->lock, spin_reporter, dcb); + dcb_printf(dcb, "\tSpinlock statistics (binlog position lock):\n"); + spinlock_stats(&router_inst->binlog_lock, spin_reporter, dcb); #endif if (router_inst->slaves) @@ -624,6 +716,29 @@ struct tm tm; session = router_inst->slaves; while (session) { + + minno = session->stats.minno; + min30 = 0.0; + min15 = 0.0; + min10 = 0.0; + min5 = 0.0; + for (j = 0; j < 30; j++) + { + minno--; + if (minno < 0) + minno += 30; + min30 += session->stats.minavgs[minno]; + if (j < 15) + min15 += session->stats.minavgs[minno]; + if (j < 10) + min10 += session->stats.minavgs[minno]; + if (j < 5) + min5 += session->stats.minavgs[minno]; + } + min30 /= 30.0; + min15 /= 15.0; + min10 /= 10.0; + min5 /= 5.0; dcb_printf(dcb, "\t\tServer-id: %d\n", session->serverid); if (session->hostname) dcb_printf(dcb, "\t\tHostname: %s\n", session->hostname); @@ -637,14 +752,18 @@ struct tm tm; dcb_printf(dcb, "\t\tNo. requests: %u\n", session->stats.n_requests); dcb_printf(dcb, "\t\tNo. events sent: %u\n", session->stats.n_events); dcb_printf(dcb, "\t\tNo. bursts sent: %u\n", session->stats.n_bursts); + minno = session->stats.minno - 1; + if (minno == -1) + minno = 30; + dcb_printf(dcb, "\t\tNumber of binlog events per minute\n"); + dcb_printf(dcb, "\t\tCurrent 5 10 15 30 Min Avg\n"); + dcb_printf(dcb, "\t\t %6d %8.1f %8.1f %8.1f %8.1f\n", + session->stats.minavgs[minno], min5, min10, + min15, min30); dcb_printf(dcb, "\t\tNo. flow control: %u\n", session->stats.n_flows); - dcb_printf(dcb, "\t\tNo. catchup NRs: %u\n", session->stats.n_catchupnr); - dcb_printf(dcb, "\t\tNo. already up to date: %u\n", session->stats.n_alreadyupd); dcb_printf(dcb, "\t\tNo. up to date: %u\n", session->stats.n_upd); - dcb_printf(dcb, "\t\tNo. of low water cbs %u\n", session->stats.n_cb); dcb_printf(dcb, "\t\tNo. of drained cbs %u\n", session->stats.n_dcb); dcb_printf(dcb, "\t\tNo. of low water cbs N/A %u\n", session->stats.n_cbna); - dcb_printf(dcb, "\t\tNo. of events > high water %u\n", session->stats.n_above); dcb_printf(dcb, "\t\tNo. of failed reads %u\n", session->stats.n_failed_read); dcb_printf(dcb, "\t\tNo. of nested distribute events %u\n", session->stats.n_overrun); dcb_printf(dcb, "\t\tNo. of distribute action 1 %u\n", session->stats.n_actions[0]); @@ -652,9 +771,11 @@ struct tm tm; dcb_printf(dcb, "\t\tNo. of distribute action 3 %u\n", session->stats.n_actions[2]); if ((session->cstate & CS_UPTODATE) == 0) { - dcb_printf(dcb, "\t\tSlave is in catchup mode. %s\n", - ((session->cstate & CS_EXPECTCB) == 0 ? "" : - "Waiting for DCB queue to drain.")); + dcb_printf(dcb, "\t\tSlave is in catchup mode. %s%s\n", + ((session->cstate & CS_EXPECTCB) == 0 ? "" : + "Waiting for DCB queue to drain."), + ((session->cstate & CS_BUSY) == 0 ? "" : + " Busy in slave catchup.")); } else @@ -672,7 +793,7 @@ struct tm tm; dcb_printf(dcb, "\tSpinlock statistics (rses_lock):\n"); spinlock_stats(&session->rses_lock, spin_reporter, dcb); #endif - + dcb_printf(dcb, "\n"); session = session->next; } spinlock_release(&router_inst->lock); @@ -718,9 +839,24 @@ ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance; static void errorReply(ROUTER *instance, void *router_session, GWBUF *message, DCB *backend_dcb, error_action_t action, bool *succp) { +ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance; +int error, len; +char msg[85]; + + len = sizeof(error); + if (getsockopt(router->master->fd, SOL_SOCKET, SO_ERROR, &error, &len) != 0) + { + strerror_r(error, msg, 80); + strcat(msg, " "); + } + else + strcpy(msg, ""); + LOGIF(LE, (skygw_log_write_flush( - LOGFILE_ERROR, "Erorr Reply '%s'", message))); - *succp = false; + LOGFILE_ERROR, "Master connection '%s', %sattempting reconnect to master", + message, msg))); + *succp = true; + blr_master_reconnect(router); } /** to be inline'd */ @@ -776,3 +912,35 @@ static uint8_t getCapabilities(ROUTER *inst, void *router_session) { return 0; } + +/** + * The stats gathering function called from the housekeeper so that we + * can get timed averages of binlog records shippped + * + * @param inst The router instance + */ +static void +stats_func(void *inst) +{ +ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)inst; +ROUTER_SLAVE *slave; + + router->stats.minavgs[router->stats.minno++] + = router->stats.n_binlogs - router->stats.lastsample; + router->stats.lastsample = router->stats.n_binlogs; + if (router->stats.minno == BLR_NSTATS_MINUTES) + router->stats.minno = 0; + + spinlock_acquire(&router->lock); + slave = router->slaves; + while (slave) + { + slave->stats.minavgs[slave->stats.minno++] + = slave->stats.n_events - slave->stats.lastsample; + slave->stats.lastsample = slave->stats.n_events; + if (slave->stats.minno == BLR_NSTATS_MINUTES) + slave->stats.minno = 0; + slave = slave->next; + } + spinlock_release(&router->lock); +} diff --git a/server/modules/routing/binlog/blr_file.c b/server/modules/routing/binlog/blr_file.c index 74aacdc99..ada392742 100644 --- a/server/modules/routing/binlog/blr_file.c +++ b/server/modules/routing/binlog/blr_file.c @@ -50,12 +50,13 @@ #include #include - extern int lm_enabled_logfiles_bitmask; + static void blr_file_create(ROUTER_INSTANCE *router, char *file); static void blr_file_append(ROUTER_INSTANCE *router, char *file); static uint32_t extract_field(uint8_t *src, int bits); +static void blr_log_header(logfile_id_t file, char *msg, uint8_t *ptr); /** * Initialise the binlog file for this instance. MaxScale will look @@ -85,6 +86,8 @@ struct dirent *dp; if (access(path, R_OK) == -1) mkdir(path, 0777); + router->binlogdir = strdup(path); + /* First try to find a binlog file number by reading the directory */ root_len = strlen(router->fileroot); dirp = opendir(path); @@ -146,17 +149,11 @@ blr_file_rotate(ROUTER_INSTANCE *router, char *file, uint64_t pos) static void blr_file_create(ROUTER_INSTANCE *router, char *file) { -char *ptr, path[1024]; +char path[1024]; int fd; unsigned char magic[] = BINLOG_MAGIC; - strcpy(path, "/usr/local/skysql/MaxScale"); - if ((ptr = getenv("MAXSCALE_HOME")) != NULL) - { - strcpy(path, ptr); - } - strcat(path, "/"); - strcat(path, router->service->name); + strcpy(path, router->binlogdir); strcat(path, "/"); strcat(path, file); @@ -167,12 +164,14 @@ unsigned char magic[] = BINLOG_MAGIC; else { LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to create binlog file %s\n", path))); + "Failed to create binlog file %s", path))); } fsync(fd); close(router->binlog_fd); + spinlock_acquire(&router->binlog_lock); strcpy(router->binlog_name, file); router->binlog_position = 4; /* Initial position after the magic number */ + spinlock_release(&router->binlog_lock); router->binlog_fd = fd; } @@ -186,30 +185,26 @@ unsigned char magic[] = BINLOG_MAGIC; static void blr_file_append(ROUTER_INSTANCE *router, char *file) { -char *ptr, path[1024]; +char path[1024]; int fd; - strcpy(path, "/usr/local/skysql/MaxScale"); - if ((ptr = getenv("MAXSCALE_HOME")) != NULL) - { - strcpy(path, ptr); - } - strcat(path, "/"); - strcat(path, router->service->name); + strcpy(path, router->binlogdir); strcat(path, "/"); strcat(path, file); if ((fd = open(path, O_RDWR|O_APPEND, 0666)) == -1) { LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to open binlog file %s for append.\n", + "Failed to open binlog file %s for append.", path))); return; } fsync(fd); close(router->binlog_fd); + spinlock_acquire(&router->binlog_lock); strcpy(router->binlog_name, file); router->binlog_position = lseek(fd, 0L, SEEK_END); + spinlock_release(&router->binlog_lock); router->binlog_fd = fd; } @@ -224,7 +219,10 @@ void blr_write_binlog_record(ROUTER_INSTANCE *router, REP_HEADER *hdr, uint8_t *buf) { pwrite(router->binlog_fd, buf, hdr->event_size, hdr->next_pos - hdr->event_size); + spinlock_acquire(&router->binlog_lock); router->binlog_position = hdr->next_pos; + router->last_written = hdr->next_pos - hdr->event_size; + spinlock_release(&router->binlog_lock); } /** @@ -238,97 +236,272 @@ blr_file_flush(ROUTER_INSTANCE *router) fsync(router->binlog_fd); } -int +/** + * Open a binlog file for reading binlog records + * + * @param router The router instance + * @param binlog The binlog filename + * @return a binlog file record + */ +BLFILE * blr_open_binlog(ROUTER_INSTANCE *router, char *binlog) { char *ptr, path[1024]; -int rval; +BLFILE *file; - strcpy(path, "/usr/local/skysql/MaxScale"); - if ((ptr = getenv("MAXSCALE_HOME")) != NULL) + spinlock_acquire(&router->fileslock); + file = router->files; + while (file && strcmp(file->binlogname, binlog) != 0) + file = file->next; + + if (file) { - strcpy(path, ptr); + file->refcnt++; + spinlock_release(&router->fileslock); + return file; } - strcat(path, "/"); - strcat(path, router->service->name); + + if ((file = (BLFILE *)malloc(sizeof(BLFILE))) == NULL) + { + spinlock_release(&router->fileslock); + return NULL; + } + strcpy(file->binlogname, binlog); + file->refcnt = 1; + file->cache = 0; + spinlock_init(&file->lock); + + strcpy(path, router->binlogdir); strcat(path, "/"); strcat(path, binlog); - if ((rval = open(path, O_RDONLY, 0666)) == -1) + if ((file->fd = open(path, O_RDONLY, 0666)) == -1) { LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to open binlog file %s\n", path))); + "Failed to open binlog file %s", path))); + free(file); + spinlock_release(&router->fileslock); + return NULL; } - return rval; + file->next = router->files; + router->files = file; + spinlock_release(&router->fileslock); + + return file; } /** * Read a replication event into a GWBUF structure. * - * @param fd File descriptor of the binlog file - * @param pos Position of binlog record to read - * @param hdr Binlog header to populate - * @return The binlog record wrapped in a GWBUF structure + * @param router The router instance + * @param file File record + * @param pos Position of binlog record to read + * @param hdr Binlog header to populate + * @return The binlog record wrapped in a GWBUF structure */ GWBUF * -blr_read_binlog(int fd, unsigned int pos, REP_HEADER *hdr) +blr_read_binlog(ROUTER_INSTANCE *router, BLFILE *file, unsigned int pos, REP_HEADER *hdr) { uint8_t hdbuf[19]; GWBUF *result; unsigned char *data; int n; +unsigned long filelen = 0; +struct stat statb; - if (lseek(fd, pos, SEEK_SET) != pos) + if (fstat(file->fd, &statb) == 0) + filelen = statb.st_size; + if (pos >= filelen) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to seek for binlog entry, " - "at %d.\n", pos))); + LOGIF(LD, (skygw_log_write(LOGFILE_ERROR, + "Attempting to read off the end of the binlog file %s, " + "event at %lu.", file->binlogname, pos))); return NULL; } + if (strcmp(router->binlog_name, file->binlogname) == 0 && + pos >= router->binlog_position) + { + return NULL; + } + + /* Read the header information from the file */ - if ((n = read(fd, hdbuf, 19)) != 19) + if ((n = pread(file->fd, hdbuf, 19, pos)) != 19) { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to read header for binlog entry, " - "at %d (%s).\n", pos, strerror(errno)))); - if (n> 0 && n < 19) + switch (n) + { + case 0: + LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, + "Reached end of binlog file at %d.", + pos))); + break; + case -1: + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to read binlog file %s at position %d" + " (%s).", file->binlogname, + pos, strerror(errno)))); + if (errno == EBADF) + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Bad file descriptor in read binlog for file %s" + ", reference count is %d, descriptor %d.", + file->binlogname, file->refcnt, file->fd))); + break; + default: LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Short read when reading the header. " - "Expected 19 bytes got %d bytes.\n", - n))); + "Expected 19 bytes but got %d bytes. " + "Binlog file is %s, position %d", + file->binlogname, pos, n))); + break; + } return NULL; } - hdr->timestamp = extract_field(hdbuf, 32); + hdr->timestamp = EXTRACT32(hdbuf); hdr->event_type = hdbuf[4]; - hdr->serverid = extract_field(&hdbuf[5], 32); + hdr->serverid = EXTRACT32(&hdbuf[5]); hdr->event_size = extract_field(&hdbuf[9], 32); - hdr->next_pos = extract_field(&hdbuf[13], 32); - hdr->flags = extract_field(&hdbuf[17], 16); + hdr->next_pos = EXTRACT32(&hdbuf[13]); + hdr->flags = EXTRACT16(&hdbuf[17]); + if (hdr->next_pos < pos && hdr->event_type != ROTATE_EVENT) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Next position in header appears to be incorrect " + "rereading event header at pos %ul in file %s, " + "file size is %ul. Master will write %ul in %s next.", + pos, file->binlogname, filelen, router->binlog_position, + router->binlog_name))); + if ((n = pread(file->fd, hdbuf, 19, pos)) != 19) + { + switch (n) + { + case 0: + LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, + "Reached end of binlog file at %d.", + pos))); + break; + case -1: + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Failed to read binlog file %s at position %d" + " (%s).", file->binlogname, + pos, strerror(errno)))); + if (errno == EBADF) + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Bad file descriptor in read binlog for file %s" + ", reference count is %d, descriptor %d.", + file->binlogname, file->refcnt, file->fd))); + break; + default: + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Short read when reading the header. " + "Expected 19 bytes but got %d bytes. " + "Binlog file is %s, position %d", + file->binlogname, pos, n))); + break; + } + return NULL; + } + hdr->timestamp = EXTRACT32(hdbuf); + hdr->event_type = hdbuf[4]; + hdr->serverid = EXTRACT32(&hdbuf[5]); + hdr->event_size = extract_field(&hdbuf[9], 32); + hdr->next_pos = EXTRACT32(&hdbuf[13]); + hdr->flags = EXTRACT16(&hdbuf[17]); + + if (hdr->next_pos < pos && hdr->event_type != ROTATE_EVENT) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Next position still incorrect after " + "rereading"))); + return NULL; + } + else + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Next position corrected by " + "rereading"))); + } + } if ((result = gwbuf_alloc(hdr->event_size)) == NULL) { LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Failed to allocate memory for binlog entry, " - "size %d at %d.\n", + "size %d at %d.", hdr->event_size, pos))); return NULL; } data = GWBUF_DATA(result); memcpy(data, hdbuf, 19); // Copy the header in - if ((n = read(fd, &data[19], hdr->event_size - 19)) + if ((n = pread(file->fd, &data[19], hdr->event_size - 19, pos + 19)) != hdr->event_size - 19) // Read the balance { - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Short read when reading the event at %d. " - "Expected %d bytes got %d bytes.\n", - pos, n))); + if (n == -1) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Error reading the event at %ld in %s. " + "%s, expected %d bytes.", + pos, file->binlogname, + strerror(errno), hdr->event_size - 19))); + } + else + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Short read when reading the event at %ld in %s. " + "Expected %d bytes got %d bytes.", + pos, file->binlogname, hdr->event_size - 19, n))); + if (filelen != 0 && filelen - pos < hdr->event_size) + { + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Binlog event is close to the end of the binlog file, " + "current file size is %u.", + filelen))); + } + blr_log_header(LOGFILE_ERROR, "Possible malformed event header", hdbuf); + } gwbuf_consume(result, hdr->event_size); return NULL; } return result; } +/** + * Close a binlog file that has been opened to read binlog records + * + * The open binlog files are shared between multiple slaves that are + * reading the same binlog file. + * + * @param router The router instance + * @param file The file to close + */ +void +blr_close_binlog(ROUTER_INSTANCE *router, BLFILE *file) +{ + spinlock_acquire(&file->lock); + file->refcnt--; + if (file->refcnt == 0) + { + spinlock_acquire(&router->fileslock); + if (router->files == file) + router->files = file->next; + else + { + BLFILE *ptr = router->files; + while (ptr && ptr->next != file) + ptr = ptr->next; + if (ptr) + ptr->next = file->next; + } + spinlock_release(&router->fileslock); + + close(file->fd); + file->fd = -1; + } + spinlock_release(&file->lock); + if (file->refcnt == 0) + free(file); +} + /** * Extract a numeric field from a packet of the specified number of bits * @@ -348,3 +521,40 @@ uint32_t rval = 0, shift = 0; } return rval; } + +/** + * Log the event header of binlog event + * + * @param file The log file into which to write the entry + * @param msg A message strign to preceed the header with + * @param ptr The event header raw data + */ +static void +blr_log_header(logfile_id_t file, char *msg, uint8_t *ptr) +{ +char buf[400], *bufp; +int i; + + bufp = buf; + bufp += sprintf(bufp, "%s: ", msg); + for (i = 0; i < 19; i++) + bufp += sprintf(bufp, "0x%02x ", ptr[i]); + skygw_log_write_flush(file, "%s", buf); + +} + +/** + * Return the size of the current binlog file + * + * @param file The binlog file + * @return The current size of the binlog file + */ +unsigned long +blr_file_size(BLFILE *file) +{ +struct stat statb; + + if (fstat(file->fd, &statb) == 0) + return statb.st_size; + return 0; +} diff --git a/server/modules/routing/binlog/blr_master.c b/server/modules/routing/binlog/blr_master.c index 684b80a0b..bb43c1a27 100644 --- a/server/modules/routing/binlog/blr_master.c +++ b/server/modules/routing/binlog/blr_master.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -60,39 +61,6 @@ /* Temporary requirement for auth data */ #include -#define SAMPLE_COUNT 10000 -CYCLES samples[10][SAMPLE_COUNT]; -int sample_index[10] = { 0, 0, 0 }; - -#define LOGD_SLAVE_CATCHUP1 0 -#define LOGD_SLAVE_CATCHUP2 1 -#define LOGD_DISTRIBUTE 2 -#define LOGD_FILE_FLUSH 3 - -SPINLOCK logspin = SPINLOCK_INIT; - -void -log_duration(int sample, CYCLES duration) -{ -char fname[100]; -int i; -FILE *fp; - - spinlock_acquire(&logspin); - samples[sample][sample_index[sample]++] = duration; - if (sample_index[sample] == SAMPLE_COUNT) - { - sprintf(fname, "binlog_profile.%d", sample); - if ((fp = fopen(fname, "a")) != NULL) - { - for (i = 0; i < SAMPLE_COUNT; i++) - fprintf(fp, "%ld\n", samples[sample][i]); - fclose(fp); - } - sample_index[sample] = 0; - } - spinlock_release(&logspin); -} extern int lm_enabled_logfiles_bitmask; @@ -126,7 +94,7 @@ GWBUF *buf; if ((client = dcb_alloc(DCB_ROLE_INTERNAL)) == NULL) { LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "Binlog router: failed to create DCB for dummy client\n"))); + "Binlog router: failed to create DCB for dummy client"))); return; } router->client = client; @@ -134,14 +102,20 @@ GWBUF *buf; if ((router->session = session_alloc(router->service, client)) == NULL) { LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "Binlog router: failed to create session for connection to master\n"))); + "Binlog router: failed to create session for connection to master"))); return; } client->session = router->session; if ((router->master = dcb_connect(router->service->databases, router->session, BLR_PROTOCOL)) == NULL) { + char *name = malloc(strlen(router->service->name) + strlen(" Master") + 1); + sprintf(name, "%s Master", router->service->name); + hktask_oneshot(name, blr_start_master, router, + BLR_MASTER_BACKOFF_TIME * router->retry_backoff++); + if (router->retry_backoff > BLR_MAX_BACKOFF) + router->retry_backoff = 1; LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, - "Binlog router: failed to connect to master server '%s'\n", + "Binlog router: failed to connect to master server '%s'", router->service->databases->unique_name))); return; } @@ -155,6 +129,7 @@ perror("setsockopt"); router->master_state = BLRM_TIMESTAMP; router->stats.n_masterstarts++; + router->retry_backoff = 1; } /** @@ -170,9 +145,7 @@ blr_restart_master(ROUTER_INSTANCE *router) { GWBUF *ptr; - dcb_close(router->master); - dcb_free(router->master); - dcb_free(router->client); + dcb_close(router->client); /* Discard the queued residual data */ ptr = router->residual; @@ -252,7 +225,7 @@ char query[128]; if (router->master_state < 0 || router->master_state > BLRM_MAXSTATE) { LOGIF(LE, (skygw_log_write( - LOGFILE_ERROR, "Invalid master state machine state (%d) for binlog router.\n", + LOGFILE_ERROR, "Invalid master state machine state (%d) for binlog router.", router->master_state))); gwbuf_consume(buf, gwbuf_length(buf)); spinlock_acquire(&router->lock); @@ -274,7 +247,7 @@ char query[128]; { LOGIF(LE, (skygw_log_write( LOGFILE_ERROR, - "Received error: %d, %s from master during %s phase of the master state machine.\n", + "Received error: %d, %s from master during %s phase of the master state machine.", MYSQL_ERROR_CODE(buf), MYSQL_ERROR_MSG(buf), blrm_states[router->master_state] ))); gwbuf_consume(buf, gwbuf_length(buf)); @@ -548,23 +521,23 @@ static REP_HEADER phdr; /* Get the length of the packet from the residual and new packet */ if (reslen >= 3) { - len = extract_field(pdata, 24); + len = EXTRACT24(pdata); } else if (reslen == 2) { - len = extract_field(pdata, 16); - len |= (extract_field(GWBUF_DATA(pkt->next), 8) << 16); + len = EXTRACT16(pdata); + len |= (*(uint8_t *)GWBUF_DATA(pkt->next) << 16); } else if (reslen == 1) { - len = extract_field(pdata, 8); - len |= (extract_field(GWBUF_DATA(pkt->next), 16) << 8); + len = *pdata; + len |= (EXTRACT16(GWBUF_DATA(pkt->next)) << 8); } len += 4; // Allow space for the header } else { - len = extract_field(pdata, 24) + 4; + len = EXTRACT24(pdata) + 4; } if (reslen < len && pkt_length >= len) @@ -585,7 +558,7 @@ static REP_HEADER phdr; LOGIF(LE,(skygw_log_write( LOGFILE_ERROR, "Insufficient memory to buffer event " - "of %d bytes. Binlog %s @ %d\n.", + "of %d bytes. Binlog %s @ %d.", len, router->binlog_name, router->binlog_position))); break; @@ -610,7 +583,7 @@ static REP_HEADER phdr; LOGFILE_ERROR, "Expected entire message in buffer " "chain, but failed to create complete " - "message as expected. %s @ %d\n", + "message as expected. %s @ %d", router->binlog_name, router->binlog_position))); free(msg); @@ -631,7 +604,7 @@ static REP_HEADER phdr; router->stats.n_residuals++; LOGIF(LD,(skygw_log_write( LOGFILE_DEBUG, - "Residual data left after %d records. %s @ %d\n", + "Residual data left after %d records. %s @ %d", router->stats.n_binlogs, router->binlog_name, router->binlog_position))); break; @@ -683,7 +656,7 @@ static REP_HEADER phdr; // #define SHOW_EVENTS #ifdef SHOW_EVENTS - printf("blr: event type 0x%02x, flags 0x%04x, event size %d\n", hdr.event_type, hdr.flags, hdr.event_size); + printf("blr: event type 0x%02x, flags 0x%04x, event size %d", hdr.event_type, hdr.flags, hdr.event_size); #endif if (hdr.event_type >= 0 && hdr.event_type < 0x24) router->stats.events[hdr.event_type]++; @@ -692,7 +665,7 @@ static REP_HEADER phdr; // Fake format description message LOGIF(LD,(skygw_log_write(LOGFILE_DEBUG, "Replication fake event. " - "Binlog %s @ %d.\n", + "Binlog %s @ %d.", router->binlog_name, router->binlog_position))); router->stats.n_fakeevents++; @@ -721,7 +694,7 @@ static REP_HEADER phdr; LOGIF(LD,(skygw_log_write( LOGFILE_DEBUG, "Replication heartbeat. " - "Binlog %s @ %d.\n", + "Binlog %s @ %d.", router->binlog_name, router->binlog_position))); router->stats.n_heartbeats++; @@ -730,6 +703,8 @@ static REP_HEADER phdr; { ptr = ptr + 5; // We don't put the first byte of the payload // into the binlog file + if (hdr.event_type == ROTATE_EVENT) + router->rotating = 1; blr_write_binlog_record(router, &hdr, ptr); if (hdr.event_type == ROTATE_EVENT) { @@ -745,7 +720,7 @@ static REP_HEADER phdr; "Artificial event not written " "to disk or distributed. " "Type 0x%x, Length %d, Binlog " - "%s @ %d\n.", + "%s @ %d.", hdr.event_type, hdr.event_size, router->binlog_name, @@ -753,6 +728,7 @@ static REP_HEADER phdr; ptr += 5; if (hdr.event_type == ROTATE_EVENT) { + router->rotating = 1; blr_rotate_event(router, ptr, &hdr); } } @@ -762,7 +738,7 @@ static REP_HEADER phdr; { printf("Binlog router error: %s\n", &ptr[7]); LOGIF(LE,(skygw_log_write(LOGFILE_ERROR, - "Error packet in binlog stream.%s @ %d\n.", + "Error packet in binlog stream.%s @ %d.", router->binlog_name, router->binlog_position))); blr_log_packet(LOGFILE_ERROR, "Error Packet:", @@ -802,9 +778,7 @@ static REP_HEADER phdr; { ss_dassert(pkt_length == 0); } -{ CYCLES start = rdtsc(); blr_file_flush(router); -log_duration(LOGD_FILE_FLUSH, rdtsc() - start); } } /** @@ -814,25 +788,25 @@ log_duration(LOGD_FILE_FLUSH, rdtsc() - start); } * @param hdr The packet header to populate */ void -blr_extract_header(uint8_t *ptr, REP_HEADER *hdr) +blr_extract_header(register uint8_t *ptr, register REP_HEADER *hdr) { - hdr->payload_len = extract_field(ptr, 24); + hdr->payload_len = EXTRACT24(ptr); hdr->seqno = ptr[3]; hdr->ok = ptr[4]; - hdr->timestamp = extract_field(&ptr[5], 32); + hdr->timestamp = EXTRACT32(&ptr[5]); hdr->event_type = ptr[9]; - hdr->serverid = extract_field(&ptr[10], 32); - hdr->event_size = extract_field(&ptr[14], 32); - hdr->next_pos = extract_field(&ptr[18], 32); - hdr->flags = extract_field(&ptr[22], 16); + hdr->serverid = EXTRACT32(&ptr[10]); + hdr->event_size = EXTRACT32(&ptr[14]); + hdr->next_pos = EXTRACT32(&ptr[18]); + hdr->flags = EXTRACT16(&ptr[22]); } /** * Extract a numeric field from a packet of the specified number of bits * * @param src The raw packet source - * @param birs The number of bits to extract (multiple of 8) + * @param bits The number of bits to extract (multiple of 8) */ inline uint32_t extract_field(register uint8_t *src, int bits) @@ -881,11 +855,13 @@ char file[BINLOG_FNAMELEN+1]; printf("New file: %s @ %ld\n", file, pos); #endif + strcpy(router->prevbinlog, router->binlog_name); if (strncmp(router->binlog_name, file, slen) != 0) { router->stats.n_rotates++; blr_file_rotate(router, file, pos); } + router->rotating = 0; } /** @@ -927,28 +903,35 @@ blr_distribute_binlog_record(ROUTER_INSTANCE *router, REP_HEADER *hdr, uint8_t * { GWBUF *pkt; uint8_t *buf; -ROUTER_SLAVE *slave; +ROUTER_SLAVE *slave, *nextslave; int action; -CYCLES entry; - entry = rdtsc(); spinlock_acquire(&router->lock); slave = router->slaves; while (slave) { - spinlock_acquire(&slave->catch_lock); - if ((slave->cstate & (CS_UPTODATE|CS_DIST)) == CS_UPTODATE) + if (slave->state != BLRS_DUMPING) { - /* Slave is up to date with the binlog and no distribute is - * running on this slave. + slave = slave->next; + continue; + } + spinlock_acquire(&slave->catch_lock); + if ((slave->cstate & (CS_UPTODATE|CS_BUSY)) == CS_UPTODATE) + { + /* + * This slave is reporting it is to date with the binlog of the + * master running on this slave. + * It has no thread running currently that is sending binlog + * events. */ action = 1; - slave->cstate |= CS_DIST; + slave->cstate |= CS_BUSY; } - else if ((slave->cstate & (CS_UPTODATE|CS_DIST)) == (CS_UPTODATE|CS_DIST)) + else if ((slave->cstate & (CS_UPTODATE|CS_BUSY)) == (CS_UPTODATE|CS_BUSY)) { - /* Slave is up to date with the binlog and a distribute is - * running on this slave. + /* + * The slave is up to date with the binlog and a process is + * running on this slave to send binlog events. */ slave->overrun = 1; action = 2; @@ -960,12 +943,20 @@ CYCLES entry; } slave->stats.n_actions[action-1]++; spinlock_release(&slave->catch_lock); + if (action == 1) { - if ((slave->binlog_pos == hdr->next_pos - hdr->event_size) - && (strcmp(slave->binlogfile, router->binlog_name) == 0 || - hdr->event_type == ROTATE_EVENT)) + if (slave->binlog_pos == router->last_written && + (strcmp(slave->binlogfile, router->binlog_name) == 0 || + (hdr->event_type == ROTATE_EVENT && + strcmp(slave->binlogfile, router->prevbinlog)))) { + /* + * The slave should be up to date, check that the binlog + * position matches the event we have to distribute or + * this is a rotate event. Send the event directly from + * memory to the slave. + */ pkt = gwbuf_alloc(hdr->event_size + 5); buf = GWBUF_DATA(pkt); encode_value(buf, hdr->event_size + 1, 24); @@ -985,77 +976,90 @@ CYCLES entry; spinlock_acquire(&slave->catch_lock); if (slave->overrun) { -CYCLES cycle_start, cycles; slave->stats.n_overrun++; slave->overrun = 0; - spinlock_release(&router->lock); - slave->cstate &= ~(CS_UPTODATE|CS_DIST); - spinlock_release(&slave->catch_lock); -cycle_start = rdtsc(); - blr_slave_catchup(router, slave); -cycles = rdtsc() - cycle_start; -log_duration(LOGD_SLAVE_CATCHUP2, cycles); - spinlock_acquire(&router->lock); - slave = router->slaves; - if (slave) - continue; - else - break; + poll_fake_write_event(slave->dcb); } else { - slave->cstate &= ~CS_DIST; + slave->cstate &= ~CS_BUSY; } spinlock_release(&slave->catch_lock); } + else if (slave->binlog_pos == hdr->next_pos + && strcmp(slave->binlogfile, router->binlog_name) == 0) + { + /* + * Slave has already read record from file, no + * need to distrbute this event + */ + spinlock_acquire(&slave->catch_lock); + slave->cstate &= ~CS_BUSY; + spinlock_release(&slave->catch_lock); + } else if ((slave->binlog_pos > hdr->next_pos - hdr->event_size) && strcmp(slave->binlogfile, router->binlog_name) == 0) { + /* + * The slave is ahead of the master, this should never + * happen. Force the slave to catchup mode in order to + * try to resolve the issue. + */ LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR, "Slave %d is ahead of expected position %s@%d. " "Expected position %d", slave->serverid, slave->binlogfile, slave->binlog_pos, hdr->next_pos - hdr->event_size))); + spinlock_acquire(&slave->catch_lock); + slave->cstate &= ~(CS_UPTODATE|CS_BUSY); + slave->cstate |= CS_EXPECTCB; + spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); } - else if ((hdr->event_type != ROTATE_EVENT) - && (slave->binlog_pos != hdr->next_pos - hdr->event_size || - strcmp(slave->binlogfile, router->binlog_name) != 0)) + else { - /* Check slave is in catchup mode and if not - * force it to go into catchup mode. + /* + * The slave is not at the position it should be. Force it into + * catchup mode rather than send this event. */ - if (slave->cstate & CS_UPTODATE) - { -CYCLES cycle_start, cycles; - spinlock_release(&router->lock); - LOGIF(LD, (skygw_log_write_flush(LOGFILE_DEBUG, - "Force slave %d into catchup mode %s@%d\n", - slave->serverid, slave->binlogfile, - slave->binlog_pos))); - spinlock_acquire(&slave->catch_lock); - slave->cstate &= ~(CS_UPTODATE|CS_DIST); - spinlock_release(&slave->catch_lock); -cycle_start = rdtsc(); - blr_slave_catchup(router, slave); -cycles = rdtsc() - cycle_start; -log_duration(LOGD_SLAVE_CATCHUP1, cycles); - spinlock_acquire(&router->lock); - slave = router->slaves; - if (slave) - continue; - else - break; - } + spinlock_acquire(&slave->catch_lock); + slave->cstate &= ~(CS_UPTODATE|CS_BUSY); + slave->cstate |= CS_EXPECTCB; + spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); } } + else if (action == 3) + { + /* Slave is not up to date + * Check if it is either expecting a callback or + * is busy processing a callback + */ + spinlock_acquire(&slave->catch_lock); + if ((slave->cstate & (CS_EXPECTCB|CS_BUSY)) == 0) + { + slave->cstate |= CS_EXPECTCB; + spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); + } + else + spinlock_release(&slave->catch_lock); + } slave = slave->next; } spinlock_release(&router->lock); - log_duration(LOGD_DISTRIBUTE, rdtsc() - entry); } +/** + * Write a raw event (the first 40 bytes at most) to a log file + * + * @param file The logfile to write to + * @param msg A textual message to write before the packet + * @param ptr Pointer to the message buffer + * @param len Length of message packet + */ static void blr_log_packet(logfile_id_t file, char *msg, uint8_t *ptr, int len) { @@ -1067,8 +1071,21 @@ int i; for (i = 0; i < len && i < 40; i++) bufp += sprintf(bufp, "0x%02x ", ptr[i]); if (i < len) - skygw_log_write_flush(file, "%s...\n", buf); + skygw_log_write_flush(file, "%s...", buf); else - skygw_log_write_flush(file, "%s\n", buf); + skygw_log_write_flush(file, "%s", buf); } + +/** + * Check if the master connection is in place and we + * are downlaoding binlogs + * + * @param router The router instance + * @return non-zero if we are recivign binlog records + */ +int +blr_master_connected(ROUTER_INSTANCE *router) +{ + return router->master_state == BLRM_BINLOGDUMP; +} diff --git a/server/modules/routing/binlog/blr_slave.c b/server/modules/routing/binlog/blr_slave.c index 878c5e157..7a6489e02 100644 --- a/server/modules/routing/binlog/blr_slave.c +++ b/server/modules/routing/binlog/blr_slave.c @@ -47,12 +47,12 @@ #include #include #include +#include #include #include #include - static uint32_t extract_field(uint8_t *src, int bits); static void encode_value(unsigned char *data, unsigned int value, int len); static int blr_slave_query(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue); @@ -61,9 +61,10 @@ static void blr_slave_send_error(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, c static int blr_slave_send_timestamp(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave); static int blr_slave_register(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue); static int blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue); -int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave); -static uint8_t *blr_build_header(GWBUF *pkt, REP_HEADER *hdr); -static int blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data); +int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, bool large); +uint8_t *blr_build_header(GWBUF *pkt, REP_HEADER *hdr); +int blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data); +static int blr_slave_fake_rotate(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave); extern int lm_enabled_logfiles_bitmask; @@ -89,7 +90,7 @@ blr_slave_request(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue) if (slave->state < 0 || slave->state > BLRS_MAXSTATE) { LOGIF(LE, (skygw_log_write( - LOGFILE_ERROR, "Invalid slave state machine state (%d) for binlog router.\n", + LOGFILE_ERROR, "Invalid slave state machine state (%d) for binlog router.", slave->state))); gwbuf_consume(queue, gwbuf_length(queue)); return 0; @@ -109,13 +110,13 @@ blr_slave_request(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue) break; case COM_QUIT: LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, - "COM_QUIT received from slave with server_id %d\n", + "COM_QUIT received from slave with server_id %d", slave->serverid))); break; default: LOGIF(LE, (skygw_log_write( LOGFILE_ERROR, - "Unexpected MySQL Command (%d) received from slave\n", + "Unexpected MySQL Command (%d) received from slave", MYSQL_COMMAND(queue)))); break; } @@ -166,7 +167,7 @@ int query_len; query_text = strndup(qtext, query_len); LOGIF(LT, (skygw_log_write( - LOGFILE_TRACE, "Execute statement from the slave '%s'\n", query_text))); + LOGFILE_TRACE, "Execute statement from the slave '%s'", query_text))); /* * Implement a very rudimental "parsing" of the query text by extarcting the * words from the statement and matchng them against the subset of queries we @@ -269,7 +270,7 @@ int query_len; query_text = strndup(qtext, query_len); LOGIF(LE, (skygw_log_write( - LOGFILE_ERROR, "Unexpected query from slave server %s\n", query_text))); + LOGFILE_ERROR, "Unexpected query from slave server %s", query_text))); free(query_text); blr_slave_send_error(router, slave, "Unexpected SQL query received from slave."); return 0; @@ -300,7 +301,7 @@ GWBUF *clone; else { LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, - "Failed to clone server response to send to slave.\n"))); + "Failed to clone server response to send to slave."))); return 0; } } @@ -474,18 +475,19 @@ blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue { GWBUF *resp; uint8_t *ptr; -int len, flags, serverid, rval; +int len, flags, serverid, rval, binlognamelen; REP_HEADER hdr; uint32_t chksum; ptr = GWBUF_DATA(queue); len = extract_field(ptr, 24); + binlognamelen = len - 11; ptr += 4; // Skip length and sequence number if (*ptr++ != COM_BINLOG_DUMP) { LOGIF(LE, (skygw_log_write( LOGFILE_ERROR, - "blr_slave_binlog_dump expected a COM_BINLOG_DUMP but received %d\n", + "blr_slave_binlog_dump expected a COM_BINLOG_DUMP but received %d", *(ptr-1)))); return 0; } @@ -496,15 +498,16 @@ uint32_t chksum; ptr += 2; serverid = extract_field(ptr, 32); ptr += 4; - strncpy(slave->binlogfile, (char *)ptr, BINLOG_FNAMELEN); + strncpy(slave->binlogfile, (char *)ptr, binlognamelen); + slave->binlogfile[binlognamelen] = 0; - slave->state = BLRS_DUMPING; slave->seqno = 1; + if (slave->nocrc) - len = 0x2b; + len = 19 + 8 + binlognamelen; else - len = 0x2f; + len = 19 + 8 + 4 + binlognamelen; // Build a fake rotate event resp = gwbuf_alloc(len + 5); @@ -520,8 +523,8 @@ uint32_t chksum; ptr = blr_build_header(resp, &hdr); encode_value(ptr, slave->binlog_pos, 64); ptr += 8; - memcpy(ptr, slave->binlogfile, BINLOG_FNAMELEN); - ptr += BINLOG_FNAMELEN; + memcpy(ptr, slave->binlogfile, binlognamelen); + ptr += binlognamelen; if (!slave->nocrc) { @@ -567,18 +570,24 @@ uint32_t chksum; slave->dcb->low_water = router->low_water; slave->dcb->high_water = router->high_water; - dcb_add_callback(slave->dcb, DCB_REASON_LOW_WATER, blr_slave_callback, slave); dcb_add_callback(slave->dcb, DCB_REASON_DRAINED, blr_slave_callback, slave); + slave->state = BLRS_DUMPING; + + LOGIF(LM, (skygw_log_write( + LOGFILE_MESSAGE, + "%s: New slave %s requested binlog file %s from position %lu", + router->service->name, slave->dcb->remote, + slave->binlogfile, slave->binlog_pos))); if (slave->binlog_pos != router->binlog_position || strcmp(slave->binlogfile, router->binlog_name) != 0) { spinlock_acquire(&slave->catch_lock); slave->cstate &= ~CS_UPTODATE; + slave->cstate |= CS_EXPECTCB; spinlock_release(&slave->catch_lock); - rval = blr_slave_catchup(router, slave); + poll_fake_write_event(slave->dcb); } - return rval; } @@ -630,7 +639,7 @@ encode_value(unsigned char *data, unsigned int value, int len) * @param hdr The packet header to populate * @return A pointer to the first byte following the event header */ -static uint8_t * +uint8_t * blr_build_header(GWBUF *pkt, REP_HEADER *hdr) { uint8_t *ptr; @@ -660,229 +669,225 @@ uint8_t *ptr; * We have a registered slave that is behind the current leading edge of the * binlog. We must replay the log entries to bring this node up to speed. * - * There may be a large numebr of records to send to the slave, the process + * There may be a large number of records to send to the slave, the process * is triggered by the slave COM_BINLOG_DUMP message and all the events must * be sent without receiving any new event. This measn there is no trigger into * MaxScale other than this initial message. However, if we simply send all the - * events we end up with an extremely long write queue on the DCB and risk running - * the server out of resources. + * events we end up with an extremely long write queue on the DCB and risk + * running the server out of resources. * - * To resolve this the concept of high and low water marks within the DCB has been - * added, with the ability for the DCB code to call user defined callbacks when the - * write queue is completely drained, when it crosses above the high water mark and - * when it crosses below the low water mark. - * - * The blr_slave_catchup routine will send binlog events to the slave until the high - * water mark is reached, at which point it will return. Later, when a low water mark - * callback is generated by the code that drains the DCB of data the blr_slave_catchup - * routine will again be called to write more events. The process is repeated until - * the slave has caught up with the master. + * The slave catchup routine will send a burst of replication events per single + * call. The paramter "long" control the number of events in the burst. The + * short burst is intended to be used when the master receive an event and + * needs to put the slave into catchup mode. This prevents the slave taking + * too much tiem away from the thread that is processing the master events. * - * Note: an additional check that the DCB is still above the low water mark is done - * prior to the return from this function to allow for any delays due to the call to - * the close system call, since this may cause thread rescheduling. + * At the end of the burst a fake EPOLLOUT event is added to the poll event + * queue. This ensures that the slave callback for processing DCB write drain + * will be called and future catchup requests will be handled on another thread. * * @param router The binlog router * @param slave The slave that is behind + * @param large Send a long or short burst of events * @return The number of bytes written */ int -blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave) +blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, bool large) { GWBUF *head, *record; REP_HEADER hdr; -int written, fd, rval = 1, burst = 0; +int written, rval = 1, burst; +int rotating; +unsigned long burst_size; uint8_t *ptr; -struct timespec req; - + if (large) + burst = router->long_burst; + else + burst = router->short_burst; + burst_size = router->burst_size; spinlock_acquire(&slave->catch_lock); - slave->cstate &= ~CS_EXPECTCB; - spinlock_release(&slave->catch_lock); -doitagain: - /* - * We have a slightly complex syncronisation mechansim here, - * we need to make sure that we do not have multiple threads - * running the catchup loop, but we need to be very careful - * that we do not loose a call that is coming via a callback - * call as this will stall the binlog catchup process. - * - * We don't want to simply use a traditional mutex here for - * the loop, since this would block a MaxScale thread for - * an unacceptable length of time. - * - * We have two status bits, the CS_READING that says we are - * in the outer loop and the CS_INNERLOOP, to say we are in - * the inner loop. - * - * If just CS_READING is set the other thread may be about to - * enter the inner loop or may be about to exit the function - * completely. Therefore we have to wait to see if CS_READING - * is cleared or CS_INNERLOOP is set. - * - * If CS_READING gets cleared then this thread should proceed - * into the loop. - * - * If CS_INNERLOOP get's set then this thread does not need to - * proceed. - * - * If CS_READING is not set then this thread simply enters the - * loop. - */ - req.tv_sec = 0; - req.tv_nsec = 1000; - spinlock_acquire(&slave->catch_lock); - if (slave->cstate & CS_UPTODATE) + if (slave->cstate & CS_BUSY) { - LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE, - "blr_slave_catchup called with up to date slave %d at " - "%s@%d. Reading position %s@%d\n", - slave->serverid, slave->binlogfile, - slave->binlog_pos, router->binlog_name, - router->binlog_position))); - slave->stats.n_alreadyupd++; spinlock_release(&slave->catch_lock); - return 1; + return 0; } - while (slave->cstate & CS_READING) - { - // Wait until we know what the other thread is doing - while ((slave->cstate & (CS_READING|CS_INNERLOOP)) == CS_READING) - { - spinlock_release(&slave->catch_lock); - nanosleep(&req, NULL); - spinlock_acquire(&slave->catch_lock); - } - // Other thread is in the innerloop - if ((slave->cstate & (CS_READING|CS_INNERLOOP)) == (CS_READING|CS_INNERLOOP)) - { - spinlock_release(&slave->catch_lock); - LOGIF(LM, (skygw_log_write( - LOGFILE_MESSAGE, - "blr_slave_catchup thread returning due to " - "lock being held by another thread. %s@%d\n", - slave->binlogfile, - slave->binlog_pos))); - slave->stats.n_catchupnr++; - return 1; // We cheat here and return 1 because otherwise - // an error would be sent and we do not want that - } - - /* Release the lock for a short time to allow the other - * thread to exit the outer reading loop. - */ - spinlock_release(&slave->catch_lock); - nanosleep(&req, NULL); - spinlock_acquire(&slave->catch_lock); - } - if (slave->pthread) - LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, "Multiple threads sending to same thread.\n"))); - slave->pthread = pthread_self(); - slave->cstate |= CS_READING; + slave->cstate |= CS_BUSY; spinlock_release(&slave->catch_lock); - if (DCB_ABOVE_HIGH_WATER(slave->dcb)) - LOGIF(LT, (skygw_log_write(LOGFILE_TRACE, "blr_slave_catchup above high water on entry.\n"))); - - do { - if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1) + if (slave->file == NULL) + { + rotating = router->rotating; + if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL) { - spinlock_acquire(&slave->catch_lock); - slave->cstate &= ~CS_READING; - spinlock_release(&slave->catch_lock); - LOGIF(LE, (skygw_log_write( + if (rotating) + { + spinlock_acquire(&slave->catch_lock); + slave->cstate |= CS_EXPECTCB; + slave->cstate &= ~CS_BUSY; + spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); + return rval; + } + LOGIF(LE, (skygw_log_write( LOGFILE_ERROR, - "blr_slave_catchup failed to open binlog file %s\n", + "blr_slave_catchup failed to open binlog file %s", slave->binlogfile))); + slave->cstate &= ~CS_BUSY; + slave->state = BLRS_ERRORED; + dcb_close(slave->dcb); return 0; } - slave->stats.n_bursts++; - spinlock_acquire(&slave->catch_lock); - slave->cstate |= CS_INNERLOOP; - spinlock_release(&slave->catch_lock); - while ((!DCB_ABOVE_HIGH_WATER(slave->dcb)) && - (record = blr_read_binlog(fd, slave->binlog_pos, &hdr)) != NULL) + } + slave->stats.n_bursts++; + while (burst-- && burst_size > 0 && + (record = blr_read_binlog(router, slave->file, slave->binlog_pos, &hdr)) != NULL) + { + head = gwbuf_alloc(5); + ptr = GWBUF_DATA(head); + encode_value(ptr, hdr.event_size + 1, 24); + ptr += 3; + *ptr++ = slave->seqno++; + *ptr++ = 0; // OK + head = gwbuf_append(head, record); + if (hdr.event_type == ROTATE_EVENT) { -if (hdr.event_size > DEF_HIGH_WATER) slave->stats.n_above++; - head = gwbuf_alloc(5); - ptr = GWBUF_DATA(head); - encode_value(ptr, hdr.event_size + 1, 24); - ptr += 3; - *ptr++ = slave->seqno++; - *ptr++ = 0; // OK - head = gwbuf_append(head, record); - if (hdr.event_type == ROTATE_EVENT) +unsigned long beat1 = hkheartbeat; + blr_close_binlog(router, slave->file); +if (hkheartbeat - beat1 > 1) LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, "blr_close_binlog took %d beats", + hkheartbeat - beat1))); + blr_slave_rotate(slave, GWBUF_DATA(record)); +beat1 = hkheartbeat; + if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL) { - close(fd); - blr_slave_rotate(slave, GWBUF_DATA(record)); - if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1) + if (rotating) { - LOGIF(LE, (skygw_log_write( - LOGFILE_ERROR, - "blr_slave_catchup failed to open binlog file %s\n", - slave->binlogfile))); - break; + spinlock_acquire(&slave->catch_lock); + slave->cstate |= CS_EXPECTCB; + slave->cstate &= ~CS_BUSY; + spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); + return rval; } + LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, + "blr_slave_catchup failed to open binlog file %s", + slave->binlogfile))); + slave->state = BLRS_ERRORED; + dcb_close(slave->dcb); + break; } - written = slave->dcb->func.write(slave->dcb, head); - if (written && hdr.event_type != ROTATE_EVENT) - { - slave->binlog_pos = hdr.next_pos; - } - rval = written; - slave->stats.n_events++; - burst++; +if (hkheartbeat - beat1 > 1) LOGIF(LE, (skygw_log_write( + LOGFILE_ERROR, "blr_open_binlog took %d beats", + hkheartbeat - beat1))); } - if (record == NULL) - slave->stats.n_failed_read++; - spinlock_acquire(&slave->catch_lock); - slave->cstate &= ~CS_INNERLOOP; - spinlock_release(&slave->catch_lock); + written = slave->dcb->func.write(slave->dcb, head); + if (written && hdr.event_type != ROTATE_EVENT) + { + slave->binlog_pos = hdr.next_pos; + } + rval = written; + slave->stats.n_events++; + burst_size -= hdr.event_size; + } + if (record == NULL) + slave->stats.n_failed_read++; + spinlock_acquire(&slave->catch_lock); + slave->cstate &= ~CS_BUSY; + spinlock_release(&slave->catch_lock); - close(fd); - } while (record && DCB_BELOW_LOW_WATER(slave->dcb)); if (record) { slave->stats.n_flows++; spinlock_acquire(&slave->catch_lock); slave->cstate |= CS_EXPECTCB; spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); + } + else if (slave->binlog_pos == router->binlog_position && + strcmp(slave->binlogfile, router->binlog_name) == 0) + { + int state_change = 0; + spinlock_acquire(&router->binlog_lock); + spinlock_acquire(&slave->catch_lock); + + /* + * Now check again since we hold the router->binlog_lock + * and slave->catch_lock. + */ + if (slave->binlog_pos != router->binlog_position || + strcmp(slave->binlogfile, router->binlog_name) != 0) + { + slave->cstate &= ~CS_UPTODATE; + slave->cstate |= CS_EXPECTCB; + spinlock_release(&slave->catch_lock); + spinlock_release(&router->binlog_lock); + poll_fake_write_event(slave->dcb); + } + else + { + if ((slave->cstate & CS_UPTODATE) == 0) + { + slave->stats.n_upd++; + slave->cstate |= CS_UPTODATE; + spinlock_release(&slave->catch_lock); + spinlock_release(&router->binlog_lock); + state_change = 1; + } + } + + if (state_change) + { + LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE, + "%s: Slave %s is up to date %s, %u.", + router->service->name, + slave->dcb->remote, + slave->binlogfile, slave->binlog_pos))); + } } else { - int state_change = 0; - spinlock_acquire(&slave->catch_lock); - if ((slave->cstate & CS_UPTODATE) == 0) + if (slave->binlog_pos >= blr_file_size(slave->file) + && router->rotating == 0 + && strcmp(router->binlog_name, slave->binlogfile) != 0 + && blr_master_connected(router)) { - slave->stats.n_upd++; - slave->cstate |= CS_UPTODATE; - state_change = 1; + /* We may have reached the end of file of a non-current + * binlog file. + * + * Note if the master is rotating there is a window during + * which the rotate event has been written to the old binlog + * but the new binlog file has not yet been created. Therefore + * we ignore these issues during the rotate processing. + */ + LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, + "Slave reached end of file for binlong file %s at %u " + "which is not the file currently being downloaded. " + "Master binlog is %s, %lu.", + slave->binlogfile, slave->binlog_pos, + router->binlog_name, router->binlog_position))); + if (blr_slave_fake_rotate(router, slave)) + { + spinlock_acquire(&slave->catch_lock); + slave->cstate |= CS_EXPECTCB; + spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); + } + else + { + slave->state = BLRS_ERRORED; + dcb_close(slave->dcb); + } + } + else + { + spinlock_acquire(&slave->catch_lock); + slave->cstate |= CS_EXPECTCB; + spinlock_release(&slave->catch_lock); + poll_fake_write_event(slave->dcb); } - spinlock_release(&slave->catch_lock); - if (state_change) - LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE, - "blr_slave_catchup slave is up to date %s, %u\n", - slave->binlogfile, slave->binlog_pos))); } - spinlock_acquire(&slave->catch_lock); -#if 0 -if (slave->pthread != pthread_self()) -{ - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Multple threads in catchup for same slave: %x and %x\n", slave->pthread, pthread_self()))); -abort(); -} -#endif - slave->pthread = 0; -#if 0 -if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position) abort(); -#endif - slave->cstate &= ~CS_READING; - spinlock_release(&slave->catch_lock); -if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position) -{ - LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Expected to be above low water\n"))); -goto doitagain; -} return rval; } @@ -896,7 +901,7 @@ goto doitagain; * @param reason The reason the callback was called * @param data The user data, in this case the server structure */ -static int +int blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data) { ROUTER_SLAVE *slave = (ROUTER_SLAVE *)data; @@ -904,11 +909,19 @@ ROUTER_INSTANCE *router = slave->router; if (reason == DCB_REASON_DRAINED) { - if (slave->state == BLRS_DUMPING && - slave->binlog_pos != router->binlog_position) + if (slave->state == BLRS_DUMPING) { + spinlock_acquire(&slave->catch_lock); + slave->cstate &= ~(CS_UPTODATE|CS_EXPECTCB); + spinlock_release(&slave->catch_lock); slave->stats.n_dcb++; - blr_slave_catchup(router, slave); + blr_slave_catchup(router, slave, true); + } + else + { + LOGIF(LD, (skygw_log_write( + LOGFILE_DEBUG, "Ignored callback due to slave state %s", + blrs_states[slave->state]))); } } @@ -917,7 +930,7 @@ ROUTER_INSTANCE *router = slave->router; if (slave->state == BLRS_DUMPING) { slave->stats.n_cb++; - blr_slave_catchup(router, slave); + blr_slave_catchup(router, slave, true); } else { @@ -931,14 +944,90 @@ ROUTER_INSTANCE *router = slave->router; * Rotate the slave to the new binlog file * * @param slave The slave instance - * @param ptr The rotate event (minux header and OK byte) + * @param ptr The rotate event (minus header and OK byte) */ void blr_slave_rotate(ROUTER_SLAVE *slave, uint8_t *ptr) { +int len = EXTRACT24(ptr + 9); // Extract the event length + + len = len - (19 + 8 + 4); // Remove length of header, checksum and position + if (len > BINLOG_FNAMELEN) + len = BINLOG_FNAMELEN; ptr += 19; // Skip header slave->binlog_pos = extract_field(ptr, 32); slave->binlog_pos += (extract_field(ptr+4, 32) << 32); - memcpy(slave->binlogfile, ptr + 8, BINLOG_FNAMELEN); - slave->binlogfile[BINLOG_FNAMELEN] = 0; + memcpy(slave->binlogfile, ptr + 8, len); + slave->binlogfile[len] = 0; +} + +/** + * Generate an internal rotate event that we can use to cause the slave to move beyond + * a binlog file that is misisng the rotate eent at the end. + * + * @param router The router instance + * @param slave The slave to rotate + * @return Non-zero if the rotate took place + */ +static int +blr_slave_fake_rotate(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave) +{ +char *sptr; +int filenum; +GWBUF *resp; +uint8_t *ptr; +int len, binlognamelen; +REP_HEADER hdr; +uint32_t chksum; + + if ((sptr = strrchr(slave->binlogfile, '.')) == NULL) + return 0; + blr_close_binlog(router, slave->file); + filenum = atoi(sptr + 1); + sprintf(slave->binlogfile, BINLOG_NAMEFMT, router->fileroot, filenum + 1); + slave->binlog_pos = 4; + if ((slave->file = blr_open_binlog(router, slave->binlogfile)) == NULL) + return 0; + + binlognamelen = strlen(slave->binlogfile); + + if (slave->nocrc) + len = 19 + 8 + binlognamelen; + else + len = 19 + 8 + 4 + binlognamelen; + + // Build a fake rotate event + resp = gwbuf_alloc(len + 5); + hdr.payload_len = len + 1; + hdr.seqno = slave->seqno++; + hdr.ok = 0; + hdr.timestamp = 0L; + hdr.event_type = ROTATE_EVENT; + hdr.serverid = router->masterid; + hdr.event_size = len; + hdr.next_pos = 0; + hdr.flags = 0x20; + ptr = blr_build_header(resp, &hdr); + encode_value(ptr, slave->binlog_pos, 64); + ptr += 8; + memcpy(ptr, slave->binlogfile, binlognamelen); + ptr += binlognamelen; + + if (!slave->nocrc) + { + /* + * Now add the CRC to the fake binlog rotate event. + * + * The algorithm is first to compute the checksum of an empty buffer + * and then the checksum of the event portion of the message, ie we do not + * include the length, sequence number and ok byte that makes up the first + * 5 bytes of the message. We also do not include the 4 byte checksum itself. + */ + chksum = crc32(0L, NULL, 0); + chksum = crc32(chksum, GWBUF_DATA(resp) + 5, hdr.event_size - 4); + encode_value(ptr, chksum, 32); + } + + slave->dcb->func.write(slave->dcb, resp); + return 1; } diff --git a/server/modules/routing/debugcmd.c b/server/modules/routing/debugcmd.c index 9f9ac8f36..3b53f4b6b 100644 --- a/server/modules/routing/debugcmd.c +++ b/server/modules/routing/debugcmd.c @@ -41,6 +41,7 @@ * than simply addresses * 23/05/14 Mark Riddoch Added support for developer and user modes * 29/05/14 Mark Riddoch Add Filter support + * 16/10/14 Mark Riddoch Add show eventq * * @endverbatim */ @@ -66,6 +67,8 @@ #include #include #include +#include +#include #include #include @@ -81,6 +84,7 @@ #define ARG_TYPE_DCB 7 #define ARG_TYPE_MONITOR 8 #define ARG_TYPE_FILTER 9 +#define ARG_TYPE_NUMERIC 10 /** * The subcommand structure @@ -117,6 +121,14 @@ struct subcommand showoptions[] = { "Show the poll statistics", "Show the poll statistics", {0, 0, 0} }, + { "eventq", 0, dShowEventQ, + "Show the queue of events waiting to be processed", + "Show the queue of events waiting to be processed", + {0, 0, 0} }, + { "eventstats", 0, dShowEventStats, + "Show the event statistics", + "Show the event statistics", + {0, 0, 0} }, { "filter", 1, dprintFilter, "Show details of a filter, called with a filter name", "Show details of a filter, called with the address of a filter", @@ -161,6 +173,10 @@ struct subcommand showoptions[] = { "Show all active sessions in MaxScale", "Show all active sessions in MaxScale", {0, 0, 0} }, + { "tasks", 0, hkshow_tasks, + "Show all active housekeeper tasks in MaxScale", + "Show all active housekeeper tasks in MaxScale", + {0, 0, 0} }, { "threads", 0, dShowThreads, "Show the status of the polling threads in MaxScale", "Show the status of the polling threads in MaxScale", @@ -282,6 +298,8 @@ struct subcommand restartoptions[] = { }; static void set_server(DCB *dcb, SERVER *server, char *bit); +static void set_pollsleep(DCB *dcb, int); +static void set_nbpoll(DCB *dcb, int); /** * The subcommands of the set command */ @@ -290,6 +308,15 @@ struct subcommand setoptions[] = { "Set the status of a server. E.g. set server dbnode4 master", "Set the status of a server. E.g. set server 0x4838320 master", {ARG_TYPE_SERVER, ARG_TYPE_STRING, 0} }, + { "pollsleep", 1, set_pollsleep, + "Set the maximum poll sleep period in milliseconds", + "Set the maximum poll sleep period in milliseconds", + {ARG_TYPE_NUMERIC, 0, 0} }, + { "nbpolls", 1, set_nbpoll, + "Set the number of non-blocking polls", + "Set the number of non-blocking polls", + {ARG_TYPE_NUMERIC, 0, 0} }, + { NULL, 0, NULL, NULL, NULL, {0, 0, 0} } }; @@ -672,6 +699,16 @@ SERVICE *service; if (mode == CLIM_USER || (rval = (unsigned long)strtol(arg, NULL, 0)) == 0) rval = (unsigned long)filter_find(arg); return rval; + case ARG_TYPE_NUMERIC: + { + int i; + for (i = 0; arg[i]; i++) + { + if (arg[i] < '0' || arg[i] > '9') + return 0; + } + return atoi(arg); + } } return 0; } @@ -1305,6 +1342,30 @@ static void disable_log_action(DCB *dcb, char *arg1) { skygw_log_disable(type); } +/** + * Set the duration of the sleep passed to the poll wait + * + * @param dcb DCB for output + * @param sleeptime Sleep time in milliseconds + */ +static void +set_pollsleep(DCB *dcb, int sleeptime) +{ + poll_set_maxwait(sleeptime); +} + +/** + * Set the number of non-blockign spins to make + * + * @param dcb DCB for output + * @param nb Number of spins + */ +static void +set_nbpoll(DCB *dcb, int nb) +{ + poll_set_nonblocking_polls(nb); +} + #if defined(FAKE_CODE) static void fail_backendfd(void) { diff --git a/server/modules/routing/readwritesplit/readwritesplit.c b/server/modules/routing/readwritesplit/readwritesplit.c index 106df0fdf..b86db4dde 100644 --- a/server/modules/routing/readwritesplit/readwritesplit.c +++ b/server/modules/routing/readwritesplit/readwritesplit.c @@ -1094,7 +1094,6 @@ static bool get_dcb( */ if (master_bref == NULL) { - succp = false; goto return_succp; } #if defined(SS_DEBUG) @@ -1165,30 +1164,20 @@ static bool get_dcb( } /** * If there are no candidates yet accept both master or - * slave. If candidate is master, any slave replaces it. + * slave. */ - else if (candidate_bref == NULL || - (SERVER_IS_MASTER(candidate_bref->bref_backend->backend_server) && - SERVER_IS_SLAVE(b->backend_server))) + else if (candidate_bref == NULL) { /** * Ensure that master has not changed dunring * session and abort if it has. */ - if (SERVER_IS_MASTER(b->backend_server)) + if (SERVER_IS_MASTER(b->backend_server) && + &backend_ref[i] == master_bref) { - if (candidate_bref != master_bref) - { - /** Log master failure */ - succp = false; - break; - } - else - { - /** found master */ - candidate_bref = &backend_ref[i]; - succp = true; - } + /** found master */ + candidate_bref = &backend_ref[i]; + succp = true; } /** * Ensure that max replication lag is not set @@ -1204,6 +1193,20 @@ static bool get_dcb( succp = true; } } + /** + * If candidate is master, any slave which doesn't break + * replication lag limits replaces it. + */ + else if (SERVER_IS_MASTER(candidate_bref->bref_backend->backend_server) && + SERVER_IS_SLAVE(b->backend_server) && + (max_rlag == MAX_RLAG_UNDEFINED || + (b->backend_server->rlag != MAX_RLAG_NOT_AVAILABLE && + b->backend_server->rlag <= max_rlag))) + { + /** found slave */ + candidate_bref = &backend_ref[i]; + succp = true; + } /** * When candidate exists, compare it against the current * backend and update assign it to new candidate if