blr branch merge

blr branch merge
This commit is contained in:
MassimilianoPinto 2014-09-11 12:20:42 +02:00
commit 7245d1baa1
30 changed files with 4649 additions and 90 deletions

View File

@ -52,3 +52,12 @@ endif
#
ERRMSG := $(HOME)/usr/share/mysql
#
# Build a binary that produces profile data
#
PROFILE := N
#
# Build a binary that produces code coverage data
#
GCOV := N

View File

@ -41,3 +41,13 @@ endif
ifdef PROF
CFLAGS := $(CFLAGS) -DSS_PROF
endif
ifeq "$(PROFILE)" "Y"
CFLAGS += -pg
LDFLAGS += -pg
endif
ifeq "$(GCOV)" "Y"
CFLAGS += -fprofile-arcs -ftest-coverage
LIBS += -lgcov
endif

View File

@ -34,6 +34,7 @@
# gateway needs mysql client lib, not qc.
# 24/07/13 Mark Ridoch Addition of encryption routines
# 30/05/14 Mark Ridoch Filter API added
# 29/08/14 Mark Riddoch Added housekeeper
include ../../build_gateway.inc
@ -47,17 +48,23 @@ CFLAGS=-c -I/usr/include -I../include -I../modules/include -I../inih \
-I$(LOGPATH) -I$(UTILSPATH) \
-Wall -g
include ../../makefile.inc
LDFLAGS=-rdynamic -L$(LOGPATH) \
-Wl,-rpath,$(DEST)/lib \
-Wl,-rpath,$(LOGPATH) -Wl,-rpath,$(UTILSPATH) \
-Wl,-rpath,$(EMBEDDED_LIB)
LIBS=-L$(EMBEDDED_LIB) \
-lmysqld \
-lz -lm -lcrypt -lcrypto -ldl -laio -lrt -pthread -llog_manager \
-L../inih/extra -linih -lssl -lstdc++
include ../../makefile.inc
SRCS= atomic.c buffer.c spinlock.c gateway.c \
gw_utils.c utils.c dcb.c load_utils.c session.c service.c server.c \
poll.c config.c users.c hashtable.c dbusers.c thread.c gwbitmask.c \
monitor.c adminusers.c secrets.c filter.c modutil.c
monitor.c adminusers.c secrets.c filter.c modutil.c housekeeper.c
HDRS= ../include/atomic.h ../include/buffer.h ../include/dcb.h \
../include/gw.h ../modules/include/mysql_client_server_protocol.h \
@ -65,18 +72,13 @@ HDRS= ../include/atomic.h ../include/buffer.h ../include/dcb.h \
../include/modules.h ../include/poll.h ../include/config.h \
../include/users.h ../include/hashtable.h ../include/gwbitmask.h \
../include/adminusers.h ../include/version.h ../include/maxscale.h \
../include/filter.h modutil.h
../include/filter.h ../include/modutil.h ../include/housekeeper.h
OBJ=$(SRCS:.c=.o)
KOBJS=maxkeys.o secrets.o utils.o
POBJS=maxpasswd.o secrets.o utils.o
LIBS=-L$(EMBEDDED_LIB) \
-lmysqld \
-lz -lm -lcrypt -lcrypto -ldl -laio -lrt -pthread -llog_manager \
-L../inih/extra -linih -lssl -lstdc++
all: maxscale maxkeys maxpasswd
cleantests:

View File

@ -32,6 +32,8 @@
* 11/07/13 Mark Riddoch Add reference count mechanism
* 16/07/2013 Massimiliano Pinto Added command type to gwbuf struct
* 24/06/2014 Mark Riddoch Addition of gwbuf_trim
* 28/08/2014 Mark Riddoch Addition of tail pointer to speed
* the gwbuf_append process
*
* @endverbatim
*/
@ -82,6 +84,7 @@ SHARED_BUF *sbuf;
sbuf->refcount = 1;
rval->sbuf = sbuf;
rval->next = NULL;
rval->tail = rval;
rval->gwbuf_type = GWBUF_TYPE_UNDEFINED;
rval->command = 0;
CHK_GWBUF(rval);
@ -131,6 +134,7 @@ GWBUF *rval;
rval->end = buf->end;
rval->gwbuf_type = buf->gwbuf_type;
rval->next = NULL;
rval->tail = rval;
CHK_GWBUF(rval);
return rval;
}
@ -157,6 +161,7 @@ GWBUF *gwbuf_clone_portion(
clonebuf->end = (void *)((char *)clonebuf->start)+length;
clonebuf->gwbuf_type = buf->gwbuf_type; /*< clone the type for now */
clonebuf->next = NULL;
clonebuf->tail = clonebuf;
CHK_GWBUF(clonebuf);
return clonebuf;
@ -233,11 +238,8 @@ GWBUF *ptr = head;
if (!head)
return tail;
CHK_GWBUF(head);
while (ptr->next)
{
ptr = ptr->next;
}
ptr->next = tail;
head->tail->next = tail;
head->tail = tail->tail;
return head;
}
@ -262,6 +264,7 @@ GWBUF *
gwbuf_consume(GWBUF *head, unsigned int length)
{
GWBUF *rval = head;
CHK_GWBUF(head);
GWBUF_CONSUME(head, length);
CHK_GWBUF(head);
@ -269,8 +272,13 @@ GWBUF *rval = head;
if (GWBUF_EMPTY(head))
{
rval = head->next;
if (head->next)
head->next->tail = head->tail;
gwbuf_free(head);
}
ss_dassert(rval->end > rval->start);
return rval;
}
@ -302,6 +310,8 @@ int rval = 0;
* buffer has n_bytes or less then it will be freed and
* NULL will be returned.
*
* This routine assumes the buffer is not part of a chain
*
* @param buf The buffer to trim
* @param n_bytes The number of bytes to trim off
* @return The buffer chain or NULL if buffer has <= n_bytes
@ -309,6 +319,8 @@ int rval = 0;
GWBUF *
gwbuf_trim(GWBUF *buf, unsigned int n_bytes)
{
ss_dassert(buf->next == NULL);
if (GWBUF_LENGTH(buf) <= n_bytes)
{
gwbuf_consume(buf, GWBUF_LENGTH(buf));

View File

@ -923,6 +923,15 @@ config_threadcount()
return gateway.n_threads;
}
/*
 * Table pairing a configuration parameter name with a log file identifier.
 * The final entry has a NULL logname and acts as the end-of-table marker.
 */
static struct {
char *logname;
logfile_id_t logfile;
} lognames[] = {
{ "log_messages", LOGFILE_MESSAGE },
{ "log_trace", LOGFILE_TRACE },
{ "log_debug", LOGFILE_DEBUG },
{ NULL, 0 }
};
/**
* Configuration handler for items in the global [MaxScale] section
*
@ -933,10 +942,20 @@ config_threadcount()
static int
handle_global_item(const char *name, const char *value)
{
int i;
if (strcmp(name, "threads") == 0) {
gateway.n_threads = atoi(value);
} else {
return 0;
for (i = 0; lognames[i].logname; i++)
{
if (strcasecmp(name, lognames[i].logname) == 0)
{
if (atoi(value))
skygw_log_enable(lognames[i].logfile);
else
skygw_log_disable(lognames[i].logfile);
}
}
}
return 1;
}

View File

@ -89,7 +89,13 @@ static int dcb_null_write(DCB *dcb, GWBUF *buf);
static int dcb_null_close(DCB *dcb);
static int dcb_null_auth(DCB *dcb, SERVER *server, SESSION *session, GWBUF *buf);
DCB* dcb_get_zombies(void)
/**
* Return the pointer to the list of zombie DCBs
*
* @return Zombies DCB list
*/
DCB *
dcb_get_zombies(void)
{
return zombies;
}
@ -128,6 +134,12 @@ DCB *rval;
spinlock_init(&rval->delayqlock);
spinlock_init(&rval->authlock);
spinlock_init(&rval->cb_lock);
spinlock_init(&rval->pollinlock);
spinlock_init(&rval->polloutlock);
rval->pollinbusy = 0;
rval->readcheck = 0;
rval->polloutbusy = 0;
rval->writecheck = 0;
rval->fd = -1;
memset(&rval->stats, 0, sizeof(DCBSTATS)); // Zero the statistics
rval->state = DCB_STATE_ALLOC;
@ -376,11 +388,6 @@ DCB_CALLBACK *cb;
}
spinlock_release(&dcb->cb_lock);
if (dcb->dcb_readqueue)
{
GWBUF* queue = dcb->dcb_readqueue;
while ((queue = gwbuf_consume(queue, GWBUF_LENGTH(queue))) != NULL);
}
bitmask_free(&dcb->memdata.bitmask);
simple_mutex_done(&dcb->dcb_read_lock);
simple_mutex_done(&dcb->dcb_write_lock);
@ -399,7 +406,7 @@ DCB_CALLBACK *cb;
*
* @param threadid The thread ID of the caller
*/
DCB*
DCB *
dcb_process_zombies(int threadid)
{
DCB *ptr, *lptr;
@ -1187,7 +1194,7 @@ printDCB(DCB *dcb)
if (dcb->remote)
printf("\tConnected to: %s\n", dcb->remote);
if (dcb->user)
printf("\tUsername to: %s\n", dcb->user);
printf("\tUsername to: %s\n", dcb->user);
if (dcb->writeq)
printf("\tQueued write data: %d\n",gwbuf_length(dcb->writeq));
printf("\tStatistics:\n");
@ -1204,6 +1211,19 @@ printDCB(DCB *dcb)
printf("\t\tNo. of Low Water Events: %d\n",
dcb->stats.n_low_water);
}
/**
* Display an entry from the spinlock statistics data
*
* @param dcb The DCB to print to
* @param desc Description of the statistic
* @param value The statistic value
*/
static void
spin_reporter(void *dcb, char *desc, int value)
{
dcb_printf((DCB *)dcb, "\t\t%-35s %d\n", desc, value);
}
/**
* Diagnostic to print all DCB allocated in the system
@ -1233,6 +1253,12 @@ void dprintAllDCBs(DCB *pdcb)
DCB *dcb;
spinlock_acquire(&dcbspin);
#if SPINLOCK_PROFILE
dcb_printf(pdcb, "DCB List Spinlock Statistics:\n");
spinlock_stats(&dcbspin, spin_reporter, pdcb);
dcb_printf(pdcb, "Zombie Queue Lock Statistics:\n");
spinlock_stats(&zombiespin, spin_reporter, pdcb);
#endif
dcb = allDCBs;
while (dcb)
{
@ -1252,12 +1278,16 @@ DCB *dcb;
dcb_printf(pdcb, "\tQueued write data: %d\n",
gwbuf_length(dcb->writeq));
dcb_printf(pdcb, "\tStatistics:\n");
dcb_printf(pdcb, "\t\tNo. of Reads: %d\n", dcb->stats.n_reads);
dcb_printf(pdcb, "\t\tNo. of Writes: %d\n", dcb->stats.n_writes);
dcb_printf(pdcb, "\t\tNo. of Buffered Writes: %d\n", dcb->stats.n_buffered);
dcb_printf(pdcb, "\t\tNo. of Accepts: %d\n", dcb->stats.n_accepts);
dcb_printf(pdcb, "\t\tNo. of High Water Events: %d\n", dcb->stats.n_high_water);
dcb_printf(pdcb, "\t\tNo. of Low Water Events: %d\n", dcb->stats.n_low_water);
dcb_printf(pdcb, "\t\tNo. of Reads: %d\n", dcb->stats.n_reads);
dcb_printf(pdcb, "\t\tNo. of Writes: %d\n", dcb->stats.n_writes);
dcb_printf(pdcb, "\t\tNo. of Buffered Writes: %d\n", dcb->stats.n_buffered);
dcb_printf(pdcb, "\t\tNo. of Accepts: %d\n", dcb->stats.n_accepts);
dcb_printf(pdcb, "\t\tNo. of busy polls: %d\n", dcb->stats.n_busypolls);
dcb_printf(pdcb, "\t\tNo. of read rechecks: %d\n", dcb->stats.n_readrechecks);
dcb_printf(pdcb, "\t\tNo. of busy write polls: %d\n", dcb->stats.n_busywrpolls);
dcb_printf(pdcb, "\t\tNo. of write rechecks: %d\n", dcb->stats.n_writerechecks);
dcb_printf(pdcb, "\t\tNo. of High Water Events: %d\n", dcb->stats.n_high_water);
dcb_printf(pdcb, "\t\tNo. of Low Water Events: %d\n", dcb->stats.n_low_water);
if (dcb->flags & DCBF_CLONE)
dcb_printf(pdcb, "\t\tDCB is a clone.\n");
dcb = dcb->next;
@ -1278,20 +1308,20 @@ DCB *dcb;
spinlock_acquire(&dcbspin);
dcb = allDCBs;
dcb_printf(pdcb, "Descriptor Control Blocks\n");
dcb_printf(pdcb, "------------+----------------------------+----------------------+----------\n");
dcb_printf(pdcb, " %-10s | %-26s | %-20s | %s\n",
dcb_printf(pdcb, "------------------+----------------------------+--------------------+----------\n");
dcb_printf(pdcb, " %-16s | %-26s | %-18s | %s\n",
"DCB", "State", "Service", "Remote");
dcb_printf(pdcb, "------------+----------------------------+----------------------+----------\n");
dcb_printf(pdcb, "------------------+----------------------------+--------------------+----------\n");
while (dcb)
{
dcb_printf(pdcb, " %10p | %-26s | %-20s | %s\n",
dcb_printf(pdcb, " %-16p | %-26s | %-18s | %s\n",
dcb, gw_dcb_state2string(dcb->state),
(dcb->session->service ?
dcb->session->service->name : ""),
((dcb->session && dcb->session->service) ? dcb->session->service->name : ""),
(dcb->remote ? dcb->remote : ""));
dcb = dcb->next;
}
dcb_printf(pdcb, "------------+----------------------------+----------------------+----------\n\n");
dcb_printf(pdcb, "------------------+----------------------------+--------------------+----------\n\n");
spinlock_release(&dcbspin);
}
@ -1308,16 +1338,16 @@ DCB *dcb;
spinlock_acquire(&dcbspin);
dcb = allDCBs;
dcb_printf(pdcb, "Client Connections\n");
dcb_printf(pdcb, "-----------------+------------+----------------------+------------\n");
dcb_printf(pdcb, " %-15s | %-10s | %-20s | %s\n",
dcb_printf(pdcb, "-----------------+------------------+----------------------+------------\n");
dcb_printf(pdcb, " %-15s | %-16s | %-20s | %s\n",
"Client", "DCB", "Service", "Session");
dcb_printf(pdcb, "-----------------+------------+----------------------+------------\n");
dcb_printf(pdcb, "-----------------+------------------+----------------------+------------\n");
while (dcb)
{
if (dcb_isclient(dcb)
&& dcb->dcb_role == DCB_ROLE_REQUEST_HANDLER)
{
dcb_printf(pdcb, " %-15s | %10p | %-20s | %10p\n",
dcb_printf(pdcb, " %-15s | %16p | %-20s | %10p\n",
(dcb->remote ? dcb->remote : ""),
dcb, (dcb->session->service ?
dcb->session->service->name : ""),
@ -1325,7 +1355,7 @@ DCB *dcb;
}
dcb = dcb->next;
}
dcb_printf(pdcb, "-----------------+------------+----------------------+------------\n\n");
dcb_printf(pdcb, "-----------------+------------------+----------------------+------------\n\n");
spinlock_release(&dcbspin);
}
@ -1342,16 +1372,18 @@ dprintDCB(DCB *pdcb, DCB *dcb)
dcb_printf(pdcb, "DCB: %p\n", (void *)dcb);
dcb_printf(pdcb, "\tDCB state: %s\n", gw_dcb_state2string(dcb->state));
if (dcb->session && dcb->session->service)
dcb_printf(pdcb, "\tService: %s\n",
dcb_printf(pdcb, "\tService: %s\n",
dcb->session->service->name);
if (dcb->remote)
dcb_printf(pdcb, "\tConnected to: %s\n", dcb->remote);
if (dcb->user)
dcb_printf(pdcb, "\tUsername: %s\n",
dcb_printf(pdcb, "\tUsername: %s\n",
dcb->user);
dcb_printf(pdcb, "\tOwning Session: %p\n", dcb->session);
if (dcb->writeq)
dcb_printf(pdcb, "\tQueued write data: %d\n", gwbuf_length(dcb->writeq));
if (dcb->delayq)
dcb_printf(pdcb, "\tDelayed write data: %d\n", gwbuf_length(dcb->delayq));
dcb_printf(pdcb, "\tStatistics:\n");
dcb_printf(pdcb, "\t\tNo. of Reads: %d\n",
dcb->stats.n_reads);
@ -1361,12 +1393,30 @@ dprintDCB(DCB *pdcb, DCB *dcb)
dcb->stats.n_buffered);
dcb_printf(pdcb, "\t\tNo. of Accepts: %d\n",
dcb->stats.n_accepts);
dcb_printf(pdcb, "\t\tNo. of busy polls: %d\n", dcb->stats.n_busypolls);
dcb_printf(pdcb, "\t\tNo. of read rechecks: %d\n", dcb->stats.n_readrechecks);
dcb_printf(pdcb, "\t\tNo. of busy write polls: %d\n", dcb->stats.n_busywrpolls);
dcb_printf(pdcb, "\t\tNo. of write rechecks: %d\n", dcb->stats.n_writerechecks);
dcb_printf(pdcb, "\t\tNo. of High Water Events: %d\n",
dcb->stats.n_high_water);
dcb_printf(pdcb, "\t\tNo. of Low Water Events: %d\n",
dcb->stats.n_low_water);
if (dcb->flags & DCBF_CLONE)
dcb_printf(pdcb, "\t\tDCB is a clone.\n");
#if SPINLOCK_PROFILE
dcb_printf(pdcb, "\tInitlock Statistics:\n");
spinlock_stats(&dcb->dcb_initlock, spin_reporter, pdcb);
dcb_printf(pdcb, "\tWrite Queue Lock Statistics:\n");
spinlock_stats(&dcb->writeqlock, spin_reporter, pdcb);
dcb_printf(pdcb, "\tDelay Queue Lock Statistics:\n");
spinlock_stats(&dcb->delayqlock, spin_reporter, pdcb);
dcb_printf(pdcb, "\tPollin Lock Statistics:\n");
spinlock_stats(&dcb->pollinlock, spin_reporter, pdcb);
dcb_printf(pdcb, "\tPollout Lock Statistics:\n");
spinlock_stats(&dcb->polloutlock, spin_reporter, pdcb);
dcb_printf(pdcb, "\tCallback Lock Statistics:\n");
spinlock_stats(&dcb->cb_lock, spin_reporter, pdcb);
#endif
}
/**
@ -1719,10 +1769,7 @@ int gw_write(
* @return Non-zero (true) if the callback was added
*/
int
dcb_add_callback(
DCB *dcb,
DCB_REASON reason,
int (*callback)(struct dcb *, DCB_REASON, void *), void *userdata)
dcb_add_callback(DCB *dcb, DCB_REASON reason, int (*callback)(struct dcb *, DCB_REASON, void *), void *userdata)
{
DCB_CALLBACK *cb, *ptr;
int rval = 1;
@ -1754,7 +1801,10 @@ int rval = 1;
return 0;
}
if (cb->next == NULL)
{
cb->next = ptr;
break;
}
cb = cb->next;
}
spinlock_release(&dcb->cb_lock);
@ -1775,7 +1825,7 @@ int rval = 1;
* @return Non-zero (true) if the callback was removed
*/
int
dcb_remove_callback(DCB *dcb, DCB_REASON reason, int (*callback)(struct dcb *, DCB_REASON), void *userdata)
dcb_remove_callback(DCB *dcb, DCB_REASON reason, int (*callback)(struct dcb *, DCB_REASON, void *), void *userdata)
{
DCB_CALLBACK *cb, *pcb = NULL;
int rval = 0;
@ -1868,8 +1918,102 @@ int rval = 0;
return rval;
}
static DCB* dcb_get_next (
DCB* dcb)
/**
 * Called by the EPOLLIN event. Take care of calling the protocol
 * read entry point and managing multiple threads competing for the DCB
 * without blocking those threads.
 *
 * This mechanism does away with the need for a mutex on the EPOLLIN event
 * and instead implements a queuing mechanism in which nested events are
 * queued on the DCB such that when the thread processing the first event
 * returns it will read the queued event and process it. This allows the
 * thread that would otherwise have to wait to process the nested event
 * to return immediately and process other events.
 *
 * The "queue" is a single flag, dcb->readcheck: any number of nested
 * events that arrive while a read is in progress result in one further
 * call of the read entry point.
 *
 * @param dcb		The DCB that has data available
 * @param thread_id	The ID of the calling thread, passed on to
 *			dcb_process_zombies before a repeated read
 */
void
dcb_pollin(DCB *dcb, int thread_id)
{
spinlock_acquire(&dcb->pollinlock);
if (dcb->pollinbusy == 0)
{
/* No other thread is reading this DCB, this thread becomes the reader */
dcb->pollinbusy = 1;
do {
if (dcb->readcheck)
{
/* A nested event was flagged while the previous read was running */
dcb->stats.n_readrechecks++;
dcb_process_zombies(thread_id);
}
dcb->readcheck = 0;
/* The read entry point is called without pollinlock held */
spinlock_release(&dcb->pollinlock);
dcb->func.read(dcb);
spinlock_acquire(&dcb->pollinlock);
} while (dcb->readcheck);
dcb->pollinbusy = 0;
}
else
{
/* Another thread is already reading: flag a recheck and return */
dcb->stats.n_busypolls++;
dcb->readcheck = 1;
}
spinlock_release(&dcb->pollinlock);
}
/**
 * Called by the EPOLLOUT event. Take care of calling the protocol
 * write_ready entry point and managing multiple threads competing for the DCB
 * without blocking those threads.
 *
 * This mechanism does away with the need for a mutex on the EPOLLOUT event
 * and instead implements a queuing mechanism in which nested events are
 * queued on the DCB such that when the thread processing the first event
 * returns it will read the queued event and process it. This allows the
 * thread that would otherwise have to wait to process the nested event
 * to return immediately and process other events.
 *
 * The "queue" is a single flag, dcb->writecheck: any number of nested
 * events that arrive while write_ready is in progress result in one
 * further call of the write_ready entry point.
 *
 * @param dcb		The DCB that's available for writes
 * @param thread_id	The ID of the calling thread, passed on to
 *			dcb_process_zombies before a repeated write_ready
 */
void
dcb_pollout(DCB *dcb, int thread_id)
{
spinlock_acquire(&dcb->polloutlock);
if (dcb->polloutbusy == 0)
{
/* No other thread is in write_ready for this DCB, this thread takes it */
dcb->polloutbusy = 1;
do {
if (dcb->writecheck)
{
/* A nested event was flagged while the previous call was running */
dcb_process_zombies(thread_id);
dcb->stats.n_writerechecks++;
}
dcb->writecheck = 0;
/* The write_ready entry point is called without polloutlock held */
spinlock_release(&dcb->polloutlock);
dcb->func.write_ready(dcb);
spinlock_acquire(&dcb->polloutlock);
} while (dcb->writecheck);
dcb->polloutbusy = 0;
}
else
{
/* Another thread is already in write_ready: flag a recheck and return */
dcb->stats.n_busywrpolls++;
dcb->writecheck = 1;
}
spinlock_release(&dcb->polloutlock);
}
/**
* Get the next DCB in the list of all DCB's
*
* @param dcb The current DCB
* @return The pointer to the next DCB or NULL if this is the last
*/
static DCB *
dcb_get_next (DCB* dcb)
{
DCB* p;
@ -1903,8 +2047,13 @@ static DCB* dcb_get_next (
return dcb;
}
void dcb_call_foreach (
DCB_REASON reason)
/**
* Call all the callbacks on all DCB's that match the reason given
*
* @param reason The DCB_REASON that triggers the callback
*/
void
dcb_call_foreach(DCB_REASON reason)
{
switch (reason) {
case DCB_REASON_CLOSE:

View File

@ -51,6 +51,7 @@
#include <modules.h>
#include <config.h>
#include <poll.h>
#include <housekeeper.h>
#include <stdlib.h>
#include <unistd.h>
@ -1510,6 +1511,12 @@ int main(int argc, char **argv)
log_flush_thr = thread_start(
log_flush_cb,
(void *)&log_flush_timeout_ms);
/*
* Start the housekeeper thread
*/
hkinit();
/*<
* Start the polling threads, note this is one less than is
* configured as the main thread will also poll.

View File

@ -28,7 +28,7 @@
* and value and to free them.
*
* The hashtable is arrange as a set of linked lists, the number of linked
* lists beign the hashsize as requested by the user. Entries are hashed by
* lists being the hashsize as requested by the user. Entries are hashed by
* calling the hash function that is passed in by the user, this is used as
* an index into the array of linked lists, using modulo hashsize.
*

195
server/core/housekeeper.c Normal file
View File

@ -0,0 +1,195 @@
/*
* This file is distributed as part of the SkySQL Gateway. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright SkySQL Ab 2014
*/
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <housekeeper.h>
#include <thread.h>
#include <spinlock.h>
/**
* @file housekeeper.c Provide a mechanism to run periodic tasks
*
* @verbatim
* Revision History
*
* Date Who Description
* 29/08/14 Mark Riddoch Initial implementation
*
* @endverbatim
*/
/**
* List of all tasks that need to be run
*/
static HKTASK *tasks = NULL;
/**
* Spinlock to protect the tasks list
*/
static SPINLOCK tasklock = SPINLOCK_INIT;
static void hkthread(void *);
/**
 * Initialise the housekeeper thread
 *
 * Starts a thread running hkthread by calling thread_start with a NULL
 * argument. The value returned by thread_start is not examined.
 */
void
hkinit()
{
thread_start(hkthread, NULL);
}
/**
 * Add a new task to the housekeeper's list of tasks that should be
 * run periodically.
 *
 * The task will first be run frequency seconds after this call is
 * made and will then be executed repeatedly every frequency seconds
 * until the task is removed.
 *
 * Task names must be unique. Every entry in the task list, including
 * the last one, is compared against the new name; if a task with the
 * same name already exists the new task is discarded.
 *
 * @param name		The unique name for this housekeeper task
 * @param taskfn	The function to call for the task
 * @param data		Data to pass to the task function
 * @param frequency	How often to run the task, expressed in seconds
 * @return	The time in seconds when the task will first be run if the
 *		task was added, otherwise 0 (memory allocation failure or
 *		duplicate task name)
 */
int
hktask_add(char *name, void (*taskfn)(void *), void *data, int frequency)
{
HKTASK	*task, *ptr, *last = NULL;
int	firstrun;

	if ((task = (HKTASK *)malloc(sizeof(HKTASK))) == NULL)
	{
		return 0;
	}
	if ((task->name = strdup(name)) == NULL)
	{
		free(task);
		return 0;
	}
	task->task = taskfn;
	task->data = data;
	task->frequency = frequency;
	task->nextdue = time(0) + frequency;
	task->next = NULL;

	/*
	 * Capture the return value now. Once the task is linked into the
	 * list and the lock is released another thread may remove and free
	 * it, so the task must not be dereferenced after that point.
	 */
	firstrun = (int)task->nextdue;

	spinlock_acquire(&tasklock);
	/* Walk the complete list so the tail entry is also checked */
	for (ptr = tasks; ptr != NULL; ptr = ptr->next)
	{
		if (strcmp(ptr->name, name) == 0)
		{
			spinlock_release(&tasklock);
			free(task->name);
			free(task);
			return 0;
		}
		last = ptr;
	}
	if (last)
		last->next = task;	/* append after the current tail */
	else
		tasks = task;		/* list was empty */
	spinlock_release(&tasklock);

	return firstrun;
}
/**
 * Remove a named task from the housekeeper's task list
 *
 * The list is searched, under the tasklock spinlock, for the first task
 * whose name matches. A matching task is unlinked while the lock is held
 * and its memory is released after the lock has been dropped.
 *
 * @param name	The task name to remove
 * @return	1 if a task was removed, 0 if no task with that name exists
 */
int
hktask_remove(char *name)
{
HKTASK	*victim, *prev = NULL;

	spinlock_acquire(&tasklock);
	for (victim = tasks; victim != NULL; victim = victim->next)
	{
		if (strcmp(victim->name, name) == 0)
			break;
		prev = victim;
	}
	if (victim != NULL)
	{
		if (prev != NULL)
			prev->next = victim->next;	/* unlink from mid-list */
		else
			tasks = victim->next;		/* unlink the list head */
	}
	spinlock_release(&tasklock);

	if (victim == NULL)
	{
		return 0;
	}

	free(victim->name);
	free(victim);
	return 1;
}
/**
* The housekeeper thread implementation.
*
* This function is responsible for executing the housekeeper tasks.
*
* The implementation of the callng of the task functions is such that
* the tasks are called without the tasklock spinlock being held. This
* allows manipulation of the housekeeper task list during execution of
* one of the tasks. The resutl is that upon completion of a task the
* search for tasks to run must restart from the start of the queue.
* It is vital that the task->nextdue tiem is updated before the task
* is run.
*
* @param data Unused, here to satisfy the thread system
*/
void
hkthread(void *data)
{
HKTASK *ptr;
time_t now;
void (*taskfn)(void *);
void *taskdata;
for (;;)
{
thread_millisleep(1000);
now = time(0);
spinlock_acquire(&tasklock);
ptr = tasks;
while (ptr)
{
if (ptr->nextdue <= now)
{
ptr->nextdue = now + ptr->frequency;
taskfn = ptr->task;
taskdata = ptr->data;
spinlock_release(&tasklock);
(*taskfn)(taskdata);
spinlock_acquire(&tasklock);
ptr = tasks;
}
else
ptr = ptr->next;
}
spinlock_release(&tasklock);
}
}

View File

@ -28,6 +28,8 @@
#include <skygw_utils.h>
#include <log_manager.h>
#include <gw.h>
#include <config.h>
#include <housekeeper.h>
extern int lm_enabled_logfiles_bitmask;
@ -41,14 +43,63 @@ extern int lm_enabled_logfiles_bitmask;
* 19/06/13 Mark Riddoch Initial implementation
* 28/06/13 Mark Riddoch Added poll mask support and DCB
* zombie management
* 29/08/14 Mark Riddoch Addition of thread status data, load average
* etc.
*
* @endverbatim
*/
static int epoll_fd = -1; /*< The epoll file descriptor */
static int do_shutdown = 0; /*< Flag the shutdown of the poll subsystem */
static int do_shutdown = 0; /*< Flag the shutdown of the poll subsystem */
static GWBITMASK poll_mask;
static simple_mutex_t epoll_wait_mutex; /*< serializes calls to epoll_wait */
static int n_waiting = 0; /*< No. of threads in epoll_wait */
/**
* Thread load average, this is the average number of descriptors in each
* poll completion, a value of 1 or less is the ideal.
*/
static double load_average = 0.0;
static int load_samples = 0;
static int load_nfds = 0;
static double current_avg = 0.0;
static double *avg_samples = NULL;
static int next_sample = 0;
static int n_avg_samples;
/* Thread statistics data */
static int n_threads; /*< No. of threads */
/**
 * Internal MaxScale thread states
 *
 * THREAD_STOPPED	Initial state set by poll_init and the state set when
 *			a polling thread returns from poll_waitevents
 * THREAD_IDLE		Set on entry to poll_waitevents and at the end of
 *			each pass of its main loop
 * THREAD_POLLING	Set just before the call to epoll_wait
 * THREAD_PROCESSING	Set while the descriptors returned by epoll_wait are
 *			being processed
 * THREAD_ZPROCESSING	Set just before dcb_process_zombies is called
 */
typedef enum { THREAD_STOPPED, THREAD_IDLE,
THREAD_POLLING, THREAD_PROCESSING,
THREAD_ZPROCESSING } THREAD_STATE;
/**
* Thread data used to report the current state and activity related to
* a thread
*/
typedef struct {
THREAD_STATE state; /*< Current thread state */
int n_fds; /*< No. of descriptors thread is processing */
DCB *cur_dcb; /*< Current DCB being processed */
uint32_t event; /*< Current event being processed */
} THREAD_DATA;
static THREAD_DATA *thread_data = NULL; /*< Status of each thread */
/**
 * The number of buckets used to gather statistics about how many
 * descriptors were processed on each epoll completion.
 *
 * An array of wakeup counts is created, with the number of descriptors used
 * to index that array. Each time a completion occurs the n_fds - 1 value is
 * used to index this array and increment the count held there.
 * If n_fds >= MAXNFDS then the count at MAXNFDS - 1 is incremented.
 */
#define MAXNFDS 10
/**
* The polling statistics
@ -60,8 +111,20 @@ static struct {
int n_hup; /*< Number of hangup events */
int n_accept; /*< Number of accept events */
int n_polls; /*< Number of poll cycles */
int n_nothreads; /*< Number of times no threads are polling */
int n_fds[MAXNFDS]; /*< Number of wakeups with particular
n_fds value */
} pollStats;
/**
* How frequently to call the poll_loadav function used to monitor the load
* average of the poll subsystem.
*/
#define POLL_LOAD_FREQ 10
/**
* Periodic function to collect load data for average calculations
*/
static void poll_loadav(void *);
/**
* Initialise the polling system we are using for the gateway.
@ -71,6 +134,8 @@ static struct {
void
poll_init()
{
int i;
if (epoll_fd != -1)
return;
if ((epoll_fd = epoll_create(MAX_EVENTS)) == -1)
@ -80,7 +145,23 @@ poll_init()
}
memset(&pollStats, 0, sizeof(pollStats));
bitmask_init(&poll_mask);
n_threads = config_threadcount();
if ((thread_data =
(THREAD_DATA *)malloc(n_threads * sizeof(THREAD_DATA))) != NULL)
{
for (i = 0; i < n_threads; i++)
{
thread_data[i].state = THREAD_STOPPED;
}
}
simple_mutex_init(&epoll_wait_mutex, "epoll_wait_mutex");
hktask_add("Load Average", poll_loadav, NULL, POLL_LOAD_FREQ);
n_avg_samples = 15 * 60 / POLL_LOAD_FREQ;
avg_samples = (double *)malloc(sizeof(double *) * n_avg_samples);
for (i = 0; i < n_avg_samples; i++)
avg_samples[i] = 0.0;
}
/**
@ -100,7 +181,7 @@ poll_add_dcb(DCB *dcb)
CHK_DCB(dcb);
ev.events = EPOLLIN | EPOLLOUT | EPOLLET;
ev.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLHUP | EPOLLET;
ev.data.ptr = dcb;
/*<
@ -245,20 +326,26 @@ return_rc:
void
poll_waitevents(void *arg)
{
struct epoll_event events[MAX_EVENTS];
int i, nfds;
int thread_id = (int)arg;
bool no_op = false;
static bool process_zombies_only = false; /*< flag for all threads */
DCB *zombies = NULL;
struct epoll_event events[MAX_EVENTS];
int i, nfds;
int thread_id = (int)arg;
bool no_op = false;
static bool process_zombies_only = false; /*< flag for all threads */
DCB *zombies = NULL;
/* Add this thread to the bitmask of running polling threads */
bitmask_set(&poll_mask, thread_id);
if (thread_data)
{
thread_data[thread_id].state = THREAD_IDLE;
}
while (1)
{
atomic_add(&n_waiting, 1);
#if BLOCKINGPOLL
nfds = epoll_wait(epoll_fd, events, MAX_EVENTS, -1);
atomic_add(&n_waiting, -1);
#else /* BLOCKINGPOLL */
if (!no_op) {
LOGIF(LD, (skygw_log_write(
@ -272,9 +359,14 @@ poll_waitevents(void *arg)
#if 0
simple_mutex_lock(&epoll_wait_mutex, TRUE);
#endif
if (thread_data)
{
thread_data[thread_id].state = THREAD_POLLING;
}
if ((nfds = epoll_wait(epoll_fd, events, MAX_EVENTS, 0)) == -1)
{
atomic_add(&n_waiting, -1);
int eno = errno;
errno = 0;
LOGIF(LD, (skygw_log_write(
@ -288,6 +380,7 @@ poll_waitevents(void *arg)
}
else if (nfds == 0)
{
atomic_add(&n_waiting, -1);
if (process_zombies_only) {
#if 0
simple_mutex_unlock(&epoll_wait_mutex);
@ -310,6 +403,13 @@ poll_waitevents(void *arg)
}
}
}
else
{
atomic_add(&n_waiting, -1);
}
if (n_waiting == 0)
atomic_add(&pollStats.n_nothreads, 1);
#if 0
simple_mutex_unlock(&epoll_wait_mutex);
#endif
@ -322,6 +422,20 @@ poll_waitevents(void *arg)
pthread_self(),
nfds)));
atomic_add(&pollStats.n_polls, 1);
if (thread_data)
{
thread_data[thread_id].n_fds = nfds;
thread_data[thread_id].cur_dcb = NULL;
thread_data[thread_id].event = 0;
thread_data[thread_id].state = THREAD_PROCESSING;
}
pollStats.n_fds[(nfds < MAXNFDS ? (nfds - 1) : MAXNFDS - 1)]++;
load_average = (load_average * load_samples + nfds)
/ (load_samples + 1);
atomic_add(&load_samples, 1);
atomic_add(&load_nfds, nfds);
for (i = 0; i < nfds; i++)
{
@ -329,6 +443,11 @@ poll_waitevents(void *arg)
__uint32_t ev = events[i].events;
CHK_DCB(dcb);
if (thread_data)
{
thread_data[thread_id].cur_dcb = dcb;
thread_data[thread_id].event = ev;
}
#if defined(SS_DEBUG)
if (dcb_fake_write_ev[dcb->fd] != 0) {
@ -364,6 +483,7 @@ poll_waitevents(void *arg)
eno = gw_getsockerrno(dcb->fd);
if (eno == 0) {
#if MUTEX_BLOCK
simple_mutex_lock(
&dcb->dcb_write_lock,
true);
@ -378,6 +498,11 @@ poll_waitevents(void *arg)
dcb->dcb_write_active = FALSE;
simple_mutex_unlock(
&dcb->dcb_write_lock);
#else
atomic_add(&pollStats.n_write,
1);
dcb_pollout(dcb, thread_id);
#endif
} else {
LOGIF(LD, (skygw_log_write(
LOGFILE_DEBUG,
@ -393,11 +518,13 @@ poll_waitevents(void *arg)
}
if (ev & EPOLLIN)
{
#if MUTEX_BLOCK
simple_mutex_lock(&dcb->dcb_read_lock,
true);
ss_info_dassert(!dcb->dcb_read_active,
"Read already active");
dcb->dcb_read_active = TRUE;
#endif
if (dcb->state == DCB_STATE_LISTENING)
{
@ -421,11 +548,17 @@ poll_waitevents(void *arg)
dcb,
dcb->fd)));
atomic_add(&pollStats.n_read, 1);
#if MUTEX_BLOCK
dcb->func.read(dcb);
#else
dcb_pollin(dcb, thread_id);
#endif
}
#if MUTEX_BLOCK
dcb->dcb_read_active = FALSE;
simple_mutex_unlock(
&dcb->dcb_read_lock);
#endif
}
if (ev & EPOLLERR)
{
@ -475,10 +608,33 @@ poll_waitevents(void *arg)
atomic_add(&pollStats.n_hup, 1);
dcb->func.hangup(dcb);
}
if (ev & EPOLLRDHUP)
{
int eno = 0;
eno = gw_getsockerrno(dcb->fd);
LOGIF(LD, (skygw_log_write(
LOGFILE_DEBUG,
"%lu [poll_waitevents] "
"EPOLLRDHUP on dcb %p, fd %d. "
"Errno %d, %s.",
pthread_self(),
dcb,
dcb->fd,
eno,
strerror(eno))));
atomic_add(&pollStats.n_hup, 1);
dcb->func.hangup(dcb);
}
} /*< for */
no_op = FALSE;
}
process_zombies:
if (thread_data)
{
thread_data[thread_id].state = THREAD_ZPROCESSING;
}
zombies = dcb_process_zombies(thread_id);
if (zombies == NULL) {
@ -491,9 +647,17 @@ poll_waitevents(void *arg)
* Remove the thread from the bitmask of running
* polling threads.
*/
if (thread_data)
{
thread_data[thread_id].state = THREAD_STOPPED;
}
bitmask_clear(&poll_mask, thread_id);
return;
}
if (thread_data)
{
thread_data[thread_id].state = THREAD_IDLE;
}
} /*< while(1) */
}
@ -525,10 +689,194 @@ poll_bitmask()
void
dprintPollStats(DCB *dcb)
{
dcb_printf(dcb, "Number of epoll cycles: %d\n", pollStats.n_polls);
dcb_printf(dcb, "Number of read events: %d\n", pollStats.n_read);
dcb_printf(dcb, "Number of write events: %d\n", pollStats.n_write);
dcb_printf(dcb, "Number of error events: %d\n", pollStats.n_error);
dcb_printf(dcb, "Number of hangup events: %d\n", pollStats.n_hup);
dcb_printf(dcb, "Number of accept events: %d\n", pollStats.n_accept);
int i;
dcb_printf(dcb, "Number of epoll cycles: %d\n",
pollStats.n_polls);
dcb_printf(dcb, "Number of read events: %d\n",
pollStats.n_read);
dcb_printf(dcb, "Number of write events: %d\n",
pollStats.n_write);
dcb_printf(dcb, "Number of error events: %d\n",
pollStats.n_error);
dcb_printf(dcb, "Number of hangup events: %d\n",
pollStats.n_hup);
dcb_printf(dcb, "Number of accept events: %d\n",
pollStats.n_accept);
dcb_printf(dcb, "Number of times no threads polling: %d\n",
pollStats.n_nothreads);
dcb_printf(dcb, "No of poll completions with descriptors\n");
dcb_printf(dcb, "\tNo. of descriptors\tNo. of poll completions.\n");
for (i = 0; i < MAXNFDS - 1; i++)
{
dcb_printf(dcb, "\t%2d\t\t\t%d\n", i + 1, pollStats.n_fds[i]);
}
dcb_printf(dcb, "\t>= %d\t\t\t%d\n", MAXNFDS,
pollStats.n_fds[MAXNFDS-1]);
}
/**
 * Convert an EPOLL event mask into a printable string
 *
 * The names of the bits that are set are joined with '|' in the fixed
 * order IN, OUT, ERR, HUP, RDHUP. A mask with none of these bits set
 * gives an empty string.
 *
 * @param event	The event mask
 * @return	A string representation, the caller must free the string.
 *		NULL is returned if memory could not be allocated.
 */
static char *
event_to_string(uint32_t event)
{
	static const struct {
		uint32_t	mask;
		const char	*label;
	} names[] = {
		{ EPOLLIN,	"IN" },
		{ EPOLLOUT,	"OUT" },
		{ EPOLLERR,	"ERR" },
		{ EPOLLHUP,	"HUP" },
		{ EPOLLRDHUP,	"RDHUP" }
	};
	char	*result;
	size_t	k;

	/* The longest result, "IN|OUT|ERR|HUP|RDHUP", fits in 22 bytes */
	result = malloc(22);
	if (result == NULL)
		return NULL;
	result[0] = '\0';

	for (k = 0; k < sizeof(names) / sizeof(names[0]); k++)
	{
		if ((event & names[k].mask) == 0)
			continue;
		/* Separate from any name already written */
		if (result[0] != '\0')
			strcat(result, "|");
		strcat(result, names[k].label);
	}
	return result;
}
/**
 * Print the thread status for all the polling threads
 *
 * The report starts with the historic and current load averages and the
 * 15, 5 and 1 minute averages derived from the avg_samples ring buffer.
 * If per-thread data is being collected a table follows with one row per
 * polling thread: its state and, while it is processing events, the
 * number of descriptors returned by the poll plus the DCB and event mask
 * currently being handled.
 *
 * @param dcb	The DCB to send the thread status data
 */
void
dShowThreads(DCB *dcb)
{
	int	i, j, n;
	char	*state;
	double	avg1 = 0.0, avg5 = 0.0, avg15 = 0.0;

	dcb_printf(dcb, "Polling Threads.\n\n");
	dcb_printf(dcb, "Historic Thread Load Average: %.2f.\n", load_average);
	dcb_printf(dcb, "Current Thread Load Average: %.2f.\n", current_avg);

	/* Average all the samples to get the 15 minute average */
	for (i = 0; i < n_avg_samples; i++)
		avg15 += avg_samples[i];
	avg15 = avg15 / n_avg_samples;

	/*
	 * Average the last third of the samples to get the 5 minute average.
	 * The divisor n_avg_samples / 3 relies on the buffer spanning 15
	 * minutes.
	 * NOTE(review): the window starts at next_sample - (n + 1), so it
	 * ends one slot before the most recently written sample - confirm
	 * that this is intended.
	 */
	n = 5 * 60 / POLL_LOAD_FREQ;
	i = next_sample - (n + 1);
	if (i < 0)
		i += n_avg_samples;
	for (j = i; j < i + n; j++)
		avg5 += avg_samples[j % n_avg_samples];
	avg5 = (3 * avg5) / (n_avg_samples);

	/* Average the last 15th of the samples to get the 1 minute average */
	n = 60 / POLL_LOAD_FREQ;
	i = next_sample - (n + 1);
	if (i < 0)
		i += n_avg_samples;
	for (j = i; j < i + n; j++)
		avg1 += avg_samples[j % n_avg_samples];
	avg1 = (15 * avg1) / (n_avg_samples);

	dcb_printf(dcb, "15 Minute Average: %.2f, 5 Minute Average: %.2f, "
			"1 Minute Average: %.2f\n\n", avg15, avg5, avg1);

	/* Without per-thread data there is nothing more to show */
	if (thread_data == NULL)
		return;

	dcb_printf(dcb, " ID | State | # fds | Descriptor | Event\n");
	dcb_printf(dcb, "----+------------+--------+------------------+---------------\n");
	for (i = 0; i < n_threads; i++)
	{
		switch (thread_data[i].state)
		{
		case THREAD_STOPPED:
			state = "Stopped";
			break;
		case THREAD_IDLE:
			state = "Idle";
			break;
		case THREAD_POLLING:
			state = "Polling";
			break;
		case THREAD_PROCESSING:
			state = "Processing";
			break;
		case THREAD_ZPROCESSING:
			state = "Collecting";
			break;
		default:
			/* Never pass an uninitialised pointer to dcb_printf */
			state = "Unknown";
			break;
		}

		if (thread_data[i].state != THREAD_PROCESSING)
		{
			dcb_printf(dcb,
				" %2d | %-10s | | |\n",
				i, state);
		}
		else if (thread_data[i].cur_dcb == NULL)
		{
			dcb_printf(dcb,
				" %2d | %-10s | %6d | |\n",
				i, state, thread_data[i].n_fds);
		}
		else
		{
			char	*event_string
				= event_to_string(thread_data[i].event);

			/*
			 * Fall back to "??" for printing only. The heap pointer
			 * is kept untouched so that free() is never handed a
			 * string literal; free(NULL) is a no-op.
			 */
			dcb_printf(dcb,
				" %2d | %-10s | %6d | %-16p | %s\n",
				i, state, thread_data[i].n_fds,
				(void *)thread_data[i].cur_dcb,
				event_string ? event_string : "??");
			free(event_string);
		}
	}
}
/**
 * The function used to calculate time based load data. This is called by the
 * housekeeper every POLL_LOAD_FREQ seconds.
 *
 * The load for the interval is the number of descriptors returned by polls
 * since the previous call divided by the number of polls sampled since the
 * previous call. The result is stored in current_avg and appended to the
 * avg_samples ring buffer, wrapping at n_avg_samples.
 *
 * @param data		Argument required by the housekeeper but not used here
 */
static void
poll_loadav(void *data)
{
	static int	last_samples = 0, last_nfds = 0;
	int		cur_samples, cur_nfds;
	int		new_samples, new_nfds;

	/*
	 * Read each running counter once so that the delta and the value
	 * remembered for the next call come from the same reading.
	 */
	cur_samples = load_samples;
	cur_nfds = load_nfds;

	new_samples = cur_samples - last_samples;
	new_nfds = cur_nfds - last_nfds;
	last_samples = cur_samples;
	last_nfds = cur_nfds;

	/* POLL_LOAD_FREQ average is... */
	if (new_samples)
	{
		/* Divide as floating point to keep the fractional part */
		current_avg = (double)new_nfds / new_samples;
	}
	else
	{
		current_avg = 0.0;
	}

	avg_samples[next_sample] = current_avg;
	next_sample++;
	if (next_sample >= n_avg_samples)
		next_sample = 0;
}

View File

@ -40,9 +40,12 @@ void
spinlock_init(SPINLOCK *lock)
{
	/*
	 * Put the spinlock into the released state. In profiling builds every
	 * counter is reset as well, including maxspins, so that statistics
	 * for a lock placed in uninitialised memory start from zero. The
	 * guard uses #if to match the one around the profiling members in
	 * spinlock.h.
	 */
	lock->lock = 0;
#if SPINLOCK_PROFILE
	lock->spins = 0;
	lock->maxspins = 0;
	lock->acquired = 0;
	lock->waiting = 0;
	lock->max_waiting = 0;
	lock->contended = 0;
#endif
}
@ -54,16 +57,29 @@ spinlock_init(SPINLOCK *lock)
/*
 * Acquire a spinlock, spinning until it becomes available.
 *
 * The lock value is incremented atomically; the loop treats a result of
 * zero from atomic_add as "lock taken" (presumably the value before the
 * increment - confirm against atomic.c). On any other result the increment
 * is undone and the attempt is repeated.
 *
 * In profiling builds the number of spins, the contention counters and the
 * last owner are recorded. The non-atomic updates happen only after the
 * lock has been obtained.
 */
void
spinlock_acquire(SPINLOCK *lock)
{
#if SPINLOCK_PROFILE
	int	spins = 0;

	atomic_add(&(lock->waiting), 1);
#endif
	while (atomic_add(&(lock->lock), 1) != 0)
	{
		/* Somebody else holds the lock, back out and retry */
		atomic_add(&(lock->lock), -1);
#if SPINLOCK_PROFILE
		atomic_add(&(lock->spins), 1);
		spins++;
#endif
	}
#if SPINLOCK_PROFILE
	if (spins)
	{
		lock->contended++;
		if (lock->maxspins < spins)
			lock->maxspins = spins;
	}
	lock->acquired++;
	lock->owner = THREAD_SHELF();
	atomic_add(&(lock->waiting), -1);
#endif
}
@ -71,7 +87,7 @@ spinlock_acquire(SPINLOCK *lock)
* Acquire a spinlock if it is not already locked.
*
* @param lock The spinlock to acquire
* @return True ifthe spinlock was acquired, otherwise false
* @return True if the spinlock was acquired, otherwise false
*/
int
spinlock_acquire_nowait(SPINLOCK *lock)
@ -81,7 +97,7 @@ spinlock_acquire_nowait(SPINLOCK *lock)
atomic_add(&(lock->lock), -1);
return FALSE;
}
#ifdef DEBUG
#ifdef SPINLOCK_PROFILE
lock->acquired++;
lock->owner = THREAD_SHELF();
#endif
@ -96,5 +112,45 @@ spinlock_acquire_nowait(SPINLOCK *lock)
/*
 * Release a spinlock.
 *
 * In profiling builds the high-water mark of waiting threads is updated
 * before the lock value is decremented, i.e. while the lock is still held.
 * The guard uses #if to match the one around the profiling members in
 * spinlock.h.
 */
void
spinlock_release(SPINLOCK *lock)
{
#if SPINLOCK_PROFILE
	if (lock->waiting > lock->max_waiting)
		lock->max_waiting = lock->waiting;
#endif
	atomic_add(&(lock->lock), -1);
}
/**
 * Report statistics on a spinlock. This only has an effect if the
 * spinlock code has been compiled with the SPINLOCK_PROFILE option set.
 *
 * NB A callback function is used to return the data rather than
 * merely printing to a DCB in order to avoid a dependency on the DCB
 * from the spinlock code and also to facilitate other uses of the
 * statistics reporting.
 *
 * The acquisition count is always reported; the remaining values are
 * only reported once the lock has been acquired at least once, which
 * also keeps the divisions below safe.
 *
 * @param lock		The spinlock to report on
 * @param reporter	The callback function to pass the statistics to
 * @param hdl		A handle that is passed to the reporter function
 */
void
spinlock_stats(SPINLOCK *lock, void (*reporter)(void *, char *, int), void *hdl)
{
#if SPINLOCK_PROFILE
	reporter(hdl, "Spinlock acquired", lock->acquired);
	if (lock->acquired)
	{
		reporter(hdl, "Total no. of spins", lock->spins);
		reporter(hdl, "Average no. of spins (overall)",
				lock->spins / lock->acquired);
		if (lock->contended)
			reporter(hdl, "Average no. of spins (when contended)",
				lock->spins / lock->contended);
		reporter(hdl, "Maximum no. of spins", lock->maxspins);
		reporter(hdl, "Maximum no. of blocked threads",
				lock->max_waiting);
		reporter(hdl, "Contended locks", lock->contended);
		/*
		 * Widen before multiplying: contended * 100 overflows an int
		 * once a busy lock has been contended more than INT_MAX / 100
		 * times.
		 */
		reporter(hdl, "Contention percentage",
			(int)(((long long)lock->contended * 100)
						/ lock->acquired));
	}
#endif
}

View File

@ -83,6 +83,7 @@ typedef struct {
*/
typedef struct gwbuf {
struct gwbuf *next; /*< Next buffer in a linked chain of buffers */
struct gwbuf *tail; /*< Last buffer in a linked chain of buffers */
void *start; /*< Start of the valid data */
void *end; /*< First byte after the valid data */
SHARED_BUF *sbuf; /*< The shared buffer with the real data */

View File

@ -53,6 +53,7 @@ struct service;
* 07/02/2014 Massimiliano Pinto Added ipv4 data struct into for dcb
* 07/05/2014 Mark Riddoch Addition of callback mechanism
* 08/05/2014 Mark Riddoch Addition of writeq high and low watermarks
* 27/08/2014 Mark Riddoch Addition of write event queuing
*
* @endverbatim
*/
@ -107,12 +108,16 @@ typedef struct gw_protocol {
* The statistics gathered on a descriptor control block
*/
typedef struct dcbstats {
	int	n_reads;	/*< Number of reads on this descriptor */
	int	n_writes;	/*< Number of writes on this descriptor */
	int	n_accepts;	/*< Number of accepts on this descriptor */
	int	n_buffered;	/*< Number of buffered writes */
	int	n_high_water;	/*< Number of crosses of high water mark */
	int	n_low_water;	/*< Number of crosses of low water mark */
	int	n_busypolls;	/*< Number of read polls while reading */
	int	n_readrechecks;	/*< Number of rechecks for reads */
	int	n_busywrpolls;	/*< Number of write polls while writing */
	int	n_writerechecks;/*< Number of rechecks for writes */
} DCBSTATS;
/**
@ -231,6 +236,13 @@ typedef struct dcb {
DCBMM memdata; /**< The data related to DCB memory management */
SPINLOCK cb_lock; /**< The lock for the callbacks linked list */
DCB_CALLBACK *callbacks; /**< The list of callbacks for the DCB */
SPINLOCK pollinlock;
int pollinbusy;
int readcheck;
SPINLOCK polloutlock;
int polloutbusy;
int writecheck;
unsigned int high_water; /**< High water mark */
unsigned int low_water; /**< Low water mark */
@ -259,6 +271,8 @@ int fail_accept_errno;
#define DCB_BELOW_LOW_WATER(x) ((x)->low_water && (x)->writeqlen < (x)->low_water)
#define DCB_ABOVE_HIGH_WATER(x) ((x)->high_water && (x)->writeqlen > (x)->high_water)
void dcb_pollin(DCB *, int);
void dcb_pollout(DCB *, int);
DCB *dcb_get_zombies(void);
int gw_write(
#if defined(SS_DEBUG)
@ -289,7 +303,7 @@ void dcb_hashtable_stats(DCB *, void *); /**< Print statisitics */
void dcb_add_to_zombieslist(DCB* dcb);
int dcb_add_callback(DCB *, DCB_REASON, int (*)(struct dcb *, DCB_REASON, void *),
void *);
int dcb_remove_callback(DCB *, DCB_REASON, int (*)(struct dcb *, DCB_REASON),
int dcb_remove_callback(DCB *, DCB_REASON, int (*)(struct dcb *, DCB_REASON, void *),
void *);
int dcb_isvalid(DCB *); /* Check the DCB is in the linked list */

View File

@ -0,0 +1,50 @@
#ifndef _HOUSEKEEPER_H
#define _HOUSEKEEPER_H
/*
* This file is distributed as part of the SkySQL Gateway. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright SkySQL Ab 2014
*/
#include <time.h>
/**
* @file housekeeper.h A mechanism to have tasks run periodically
*
* @verbatim
* Revision History
*
* Date Who Description
* 29/08/14 Mark Riddoch Initial implementation
*
* @endverbatim
*/
/**
 * A single entry in the housekeeper's linked list of periodic tasks
 */
typedef struct hktask {
	char		*name;			/*< Name identifying the task */
	void		(*task)(void *data);	/*< Function invoked to run the task */
	void		*data;			/*< Argument handed to the task function */
	int		frequency;		/*< Interval between runs, in seconds */
	time_t		nextdue;		/*< Time at which the task is next due */
	struct hktask	*next;			/*< Following task in the list */
} HKTASK;
extern void hkinit();
extern int hktask_add(char *name, void (*task)(void *), void *data, int frequency);
extern int hktask_remove(char *name);
#endif

View File

@ -41,4 +41,5 @@ extern void poll_waitevents(void *);
extern void poll_shutdown();
extern GWBITMASK *poll_bitmask();
extern void dprintPollStats(DCB *);
extern void dShowThreads(DCB *dcb);
#endif

View File

@ -21,7 +21,7 @@
/**
* @file spinlock.h
*
* Spinlock implementation for ther gateway.
* Spinlock implementation for MaxScale.
*
* Spinlocks are cheap locks that can be used to protect short code blocks, they are
* generally wasteful as any blocked threads will spin, consuming CPU cycles, waiting
@ -31,12 +31,28 @@
#include <thread.h>
#include <stdbool.h>
#define SPINLOCK_PROFILE 1
/**
* The spinlock structure.
*
* In normal builds the structure merely contains a lock value which
* is 0 if the spinlock is not taken and greater than zero if it is held.
*
* In builds with the SPINLOCK_PROFILE option set this structure also holds
* a number of profile related fields that count the number of spins, number
* of waiting threads and the number of times the lock has been acquired.
*/
typedef struct spinlock {
	int	lock;		/*< Is the lock held? */
#if SPINLOCK_PROFILE
	int	spins;		/*< Number of spins on this lock */
	int	maxspins;	/*< Max no of spins to acquire lock */
	int	acquired;	/*< No. of times lock was acquired */
	int	waiting;	/*< No. of threads acquiring this lock */
	int	max_waiting;	/*< Max no of threads waiting for lock */
	int	contended;	/*< No. of times acquire was contended */
	THREAD	owner;		/*< Last owner of this lock */
#endif
} SPINLOCK;
@ -47,8 +63,8 @@ typedef struct spinlock {
#define FALSE false
#endif
/*
 * Static initialiser for a SPINLOCK. The profiling form supplies one zero
 * for each of the eight members of the structure (lock, spins, maxspins,
 * acquired, waiting, max_waiting, contended and owner).
 */
#if SPINLOCK_PROFILE
#define SPINLOCK_INIT { 0, 0, 0, 0, 0, 0, 0, 0 }
#else
#define SPINLOCK_INIT { 0 }
#endif
@ -59,4 +75,6 @@ extern void spinlock_init(SPINLOCK *lock);
extern void spinlock_acquire(SPINLOCK *lock);
extern int spinlock_acquire_nowait(SPINLOCK *lock);
extern void spinlock_release(SPINLOCK *lock);
extern void spinlock_stats(SPINLOCK *lock,
void (*reporter)(void *, char *, int), void *hdl);
#endif

View File

@ -0,0 +1,367 @@
#ifndef _BLR_H
#define _BLR_H
/*
* This file is distributed as part of MaxScale. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright SkySQL Ab 2014
*/
/**
* @file blr.h - The binlog router header file
*
* @verbatim
* Revision History
*
* Date Who Description
* 02/04/14 Mark Riddoch Initial implementation
*
* @endverbatim
*/
#include <dcb.h>
#include <buffer.h>
#include <pthread.h>
#define BINLOG_FNAMELEN 16
#define BLR_PROTOCOL "MySQLBackend"
#define BINLOG_MAGIC { 0xfe, 0x62, 0x69, 0x6e }
#define BINLOG_NAMEFMT "%s.%06d"
#define BINLOG_NAME_ROOT "mysql-bin"
/**
* High and Low water marks for the slave dcb. These values can be overridden
* by the router options highwater and lowwater.
*/
#define DEF_LOW_WATER 20000
#define DEF_HIGH_WATER 300000
/**
* Some useful macros for examining the MySQL Response packets
*/
#define MYSQL_RESPONSE_OK(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4) == 0x00)
#define MYSQL_RESPONSE_EOF(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4) == 0xfe)
#define MYSQL_RESPONSE_ERR(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4) == 0xff)
#define MYSQL_ERROR_CODE(buf) (*((uint8_t *)GWBUF_DATA(buf) + 5))
#define MYSQL_ERROR_MSG(buf) ((uint8_t *)GWBUF_DATA(buf) + 6)
#define MYSQL_COMMAND(buf) (*((uint8_t *)GWBUF_DATA(buf) + 4))
/**
* Slave statistics
*/
typedef struct {
int n_events; /*< Number of events sent */
int n_bursts; /*< Number of bursts sent */
int n_requests; /*< Number of requests received */
int n_flows; /*< Number of flow control restarts */
int n_catchupnr; /*< No. of times catchup resulted in not entering loop */
int n_alreadyupd;
int n_upd;
int n_cb;
int n_cbna;
int n_dcb;
int n_above;
int n_failed_read;
int n_overrun;
int n_actions[3];
} SLAVE_STATS;
/**
 * The client session structure used within this router. This represents
 * the slaves that are replicating binlogs from MaxScale.
 *
 * Slave sessions are chained through the next member; the router instance
 * holds the head of such a list in its slaves member. When SS_DEBUG is
 * defined the structure is bracketed by two check fields.
 */
typedef struct router_slave {
#if defined(SS_DEBUG)
skygw_chk_t rses_chk_top;
#endif
DCB *dcb; /*< The slave server DCB */
int state; /*< The state of this slave */
int binlog_pos; /*< Binlog position for this slave */
char binlogfile[BINLOG_FNAMELEN+1];
/*< Current binlog file for this slave */
int serverid; /*< Server-id of the slave */
char *hostname; /*< Hostname of the slave, if known */
char *user; /*< Username if given */
char *passwd; /*< Password if given */
short port; /*< MySQL port */
/* NOTE(review): a signed short cannot hold ports above 32767 - confirm how this is used */
int nocrc; /*< Disable CRC */
int overrun; /*< NOTE(review): undocumented, meaning not evident from this header */
uint32_t rank; /*< Replication rank */
uint8_t seqno; /*< Replication dump sequence no */
SPINLOCK catch_lock; /*< Event catchup lock */
unsigned int cstate; /*< Catch up state */
SPINLOCK rses_lock; /*< Protects rses_deleted */
/* NOTE(review): no rses_deleted member exists in this structure - confirm what rses_lock guards */
pthread_t pthread; /*< NOTE(review): undocumented thread identifier */
struct router_instance
*router; /*< Pointer to the owning router */
struct router_slave *next; /*< Next slave in the linked list */
SLAVE_STATS stats; /*< Slave statistics */
#if defined(SS_DEBUG)
skygw_chk_t rses_chk_tail;
#endif
} ROUTER_SLAVE;
/**
 * The statistics for this router instance
 *
 * The events array has 0x24 entries, one more than the highest binlog event
 * type defined in this header (PREVIOUS_GTIDS_EVENT, 0x23), which allows it
 * to be indexed directly by event type.
 */
typedef struct {
int n_slaves; /*< Number slave sessions created */
int n_reads; /*< Number of record reads */
uint64_t n_binlogs; /*< Number of binlog records from master */
uint64_t n_binlog_errors;/*< NOTE(review): presumably number of binlog errors; the original comment duplicated the n_binlogs text */
uint64_t n_rotates; /*< Number of binlog rotate events */
uint64_t n_cachehits; /*< Number of hits on the binlog cache */
uint64_t n_cachemisses; /*< Number of misses on the binlog cache */
int n_registered; /*< Number of registered slaves */
int n_masterstarts; /*< Number of times connection restarted */
int n_delayedreconnects; /*< NOTE(review): presumably reconnects that were deferred - confirm */
int n_residuals; /*< Number of times residual data was buffered */
int n_heartbeats; /*< Number of heartbeat messages */
time_t lastReply; /*< NOTE(review): presumably time of the last reply from the master - confirm */
uint64_t n_fakeevents; /*< Fake events not written to disk */
uint64_t n_artificial; /*< Artificial events not written to disk */
uint64_t events[0x24]; /*< Per event counters */
} ROUTER_STATS;
/**
 * Saved responses from the master that will be forwarded to slaves
 *
 * All of the responses are held as GWBUF pointers except for the format
 * description event, which is kept as a raw byte pointer together with its
 * length.
 */
typedef struct {
GWBUF *server_id; /*< Master server id */
GWBUF *heartbeat; /*< Heartbeat period */
GWBUF *chksum1; /*< Binlog checksum 1st response */
GWBUF *chksum2; /*< Binlog checksum 2nd response */
GWBUF *gtid_mode; /*< GTID Mode response */
GWBUF *uuid; /*< Master UUID */
GWBUF *setslaveuuid; /*< Set Slave UUID */
GWBUF *setnames; /*< Set NAMES latin1 */
GWBUF *utf8; /*< Set NAMES utf8 */
GWBUF *select1; /*< select 1 */
GWBUF *selectver; /*< select version() */
uint8_t *fde_event; /*< Format Description Event */
int fde_len; /*< Length of fde_event */
} MASTER_RESPONSES;
/**
 * The binlog record structure. This contains the actual packet received from the
 * master, the binlog position of the data in the packet, a pointer to the data and
 * the length of the binlog record.
 *
 * This allows requests for binlog records in the cache to be serviced by simply
 * sending the exact same packet as was received by MaxScale from the master.
 * Items are written to the backing file as soon as they are received. The binlog
 * cache is flushed of old records periodically, releasing the GWBUFs back to the
 * free memory pool.
 */
typedef struct {
unsigned long position; /*< binlog record position for this cache entry */
GWBUF *pkt; /*< The packet received from the master */
unsigned char *data; /*< Pointer to the data within the packet */
unsigned int record_len; /*< Binlog record length */
} BLCACHE_RECORD;
/**
 * The binlog cache. A cache exists for each file that holds cached binlog records.
 * Typically the router will hold two binlog caches, one for the current file and one
 * for the previous file.
 */
typedef struct {
char filename[BINLOG_FNAMELEN+1]; /*< Name of the binlog file this cache belongs to */
BLCACHE_RECORD *first; /*< NOTE(review): presumably the first cached record - confirm */
BLCACHE_RECORD *current; /*< NOTE(review): presumably the most recent cached record - confirm */
int cnt; /*< NOTE(review): presumably the number of cached records - confirm */
} BLCACHE;
/**
 * The per instance data for the router.
 *
 * Instances are chained through the next member. The two entries of the
 * cache array match the "current file and previous file" pair described
 * for BLCACHE, though which slot is which is not shown in this header.
 */
typedef struct router_instance {
SERVICE *service; /*< Pointer to the service using this router */
ROUTER_SLAVE *slaves; /*< Link list of all the slave connections */
SPINLOCK lock; /*< Spinlock for the instance data */
char *uuid; /*< UUID for the router to use w/master */
int masterid; /*< Server ID of the master */
int serverid; /*< Server ID to use with master */
char *user; /*< User name to use with master */
char *password; /*< Password to use with master */
char *fileroot; /*< Root of binlog filename */
DCB *master; /*< DCB for master connection */
DCB *client; /*< DCB for dummy client */
SESSION *session; /*< Fake session for master connection */
unsigned int master_state; /*< State of the master FSM */
uint8_t lastEventReceived; /*< NOTE(review): presumably the type of the last binlog event received - confirm */
GWBUF *residual; /*< Any residual binlog event */
MASTER_RESPONSES saved_master; /*< Saved master responses */
char binlog_name[BINLOG_FNAMELEN+1];
/*< Name of the current binlog file */
uint64_t binlog_position;
/*< Current binlog position */
int binlog_fd; /*< File descriptor of the binlog
* file being written
*/
unsigned int low_water; /*< Low water mark for client DCB */
unsigned int high_water; /*< High water mark for client DCB */
BLCACHE *cache[2]; /*< The binlog caches held by this instance */
ROUTER_STATS stats; /*< Statistics for this router */
int active_logs; /*< NOTE(review): undocumented, meaning not evident from this header */
int reconnect_pending; /*< NOTE(review): presumably set while a master reconnect is outstanding - confirm */
int handling_threads; /*< NOTE(review): undocumented, meaning not evident from this header */
struct router_instance
*next; /*< Next router instance in the linked list */
} ROUTER_INSTANCE;
/**
 * The header fields extracted from a replication message
 *
 * The first three members describe the enclosing packet, the remainder
 * describe the binlog event carried within it.
 */
typedef struct rep_header {
	int		payload_len;	/*< Length of the payload (24 bit value) */
	uint8_t		seqno;		/*< Sequence number of the response */
	uint8_t		ok;		/*< The OK byte taken from the packet */
	uint32_t	timestamp;	/*< Timestamp at the start of the binlog record */
	uint8_t		event_type;	/*< Type of the binlog event */
	uint32_t	serverid;	/*< Server id of the master */
	uint32_t	event_size;	/*< Combined size of header, post-header and body */
	uint32_t	next_pos;	/*< Position of the next event */
	uint16_t	flags;		/*< Flags of the event */
} REP_HEADER;
/**
* State machine for the master to MaxScale replication
*/
#define BLRM_UNCONNECTED 0x0000
#define BLRM_AUTHENTICATED 0x0001
#define BLRM_TIMESTAMP 0x0002
#define BLRM_SERVERID 0x0003
#define BLRM_HBPERIOD 0x0004
#define BLRM_CHKSUM1 0x0005
#define BLRM_CHKSUM2 0x0006
#define BLRM_GTIDMODE 0x0007
#define BLRM_MUUID 0x0008
#define BLRM_SUUID 0x0009
#define BLRM_LATIN1 0x000A
#define BLRM_UTF8 0x000B
#define BLRM_SELECT1 0x000C
#define BLRM_SELECTVER 0x000D
#define BLRM_REGISTER 0x000E
#define BLRM_BINLOGDUMP 0x000F
#define BLRM_MAXSTATE 0x000F
/*
 * Printable names for the master connection states, listed in the order of
 * the BLRM_* state values. The table must hold BLRM_MAXSTATE + 1 entries.
 */
static char *blrm_states[] = { "Unconnected", "Authenticated", "Timestamp retrieval",
	"Server ID retrieval", "HeartBeat Period setup", "binlog checksum config",
	"binlog checksum retrieval", "GTID Mode retrieval", "Master UUID retrieval",
	"Set Slave UUID", "Set Names latin1", "Set Names utf8", "select 1",
	"select version()", "Register slave", "Binlog Dump" };
#define BLRS_CREATED 0x0000
#define BLRS_UNREGISTERED 0x0001
#define BLRS_REGISTERED 0x0002
#define BLRS_DUMPING 0x0003
#define BLRS_MAXSTATE 0x0003
/*
 * Printable names for the slave states, listed in the order of the
 * BLRS_* state values.
 */
static char *blrs_states[] = {
	"Created",
	"Unregistered",
	"Registered",
	"Sending binlogs"
};
/**
* Slave catch-up status
*/
#define CS_READING 0x0001
#define CS_INNERLOOP 0x0002
#define CS_UPTODATE 0x0004
#define CS_EXPECTCB 0x0008
#define CS_DIST 0x0010
#define CS_DISTLATCH 0x0020
/**
* MySQL protocol OpCodes needed for replication
*/
#define COM_QUIT 0x01
#define COM_QUERY 0x03
#define COM_REGISTER_SLAVE 0x15
#define COM_BINLOG_DUMP 0x12
/**
* Binlog event types
*/
#define START_EVENT_V3 0x01
#define QUERY_EVENT 0x02
#define STOP_EVENT 0x03
#define ROTATE_EVENT 0x04
#define INTVAR_EVENT 0x05
#define LOAD_EVENT 0x06
#define SLAVE_EVENT 0x07
#define CREATE_FILE_EVENT 0x08
#define APPEND_BLOCK_EVENT 0x09
#define EXEC_LOAD_EVENT 0x0A
#define DELETE_FILE_EVENT 0x0B
#define NEW_LOAD_EVENT 0x0C
#define RAND_EVENT 0x0D
#define USER_VAR_EVENT 0x0E
#define FORMAT_DESCRIPTION_EVENT 0x0F
#define XID_EVENT 0x10
#define BEGIN_LOAD_QUERY_EVENT 0x11
#define EXECUTE_LOAD_QUERY_EVENT 0x12
#define TABLE_MAP_EVENT 0x13
#define WRITE_ROWS_EVENTv0 0x14
#define UPDATE_ROWS_EVENTv0 0x15
#define DELETE_ROWS_EVENTv0 0x16
#define WRITE_ROWS_EVENTv1 0x17
#define UPDATE_ROWS_EVENTv1 0x18
#define DELETE_ROWS_EVENTv1 0x19
#define INCIDENT_EVENT 0x1A
#define HEARTBEAT_EVENT 0x1B
#define IGNORABLE_EVENT 0x1C
#define ROWS_QUERY_EVENT 0x1D
#define WRITE_ROWS_EVENTv2 0x1E
#define UPDATE_ROWS_EVENTv2 0x1F
#define DELETE_ROWS_EVENTv2 0x20
#define GTID_EVENT 0x21
#define ANONYMOUS_GTID_EVENT 0x22
#define PREVIOUS_GTIDS_EVENT 0x23
/**
* Binlog event flags
*/
#define LOG_EVENT_BINLOG_IN_USE_F 0x0001
#define LOG_EVENT_FORCED_ROTATE_F 0x0002
#define LOG_EVENT_THREAD_SPECIFIC_F 0x0004
#define LOG_EVENT_SUPPRESS_USE_F 0x0008
#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x0010
#define LOG_EVENT_ARTIFICIAL_F 0x0020
#define LOG_EVENT_RELAY_LOG_F 0x0040
#define LOG_EVENT_IGNORABLE_F 0x0080
#define LOG_EVENT_NO_FILTER_F 0x0100
#define LOG_EVENT_MTS_ISOLATE_F 0x0200
/*
* Externals within the router
*/
extern void blr_start_master(ROUTER_INSTANCE *);
extern void blr_master_response(ROUTER_INSTANCE *, GWBUF *);
extern void blr_master_reconnect(ROUTER_INSTANCE *);
extern int blr_slave_request(ROUTER_INSTANCE *, ROUTER_SLAVE *, GWBUF *);
extern void blr_slave_rotate(ROUTER_SLAVE *slave, uint8_t *ptr);
extern int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave);
extern void blr_init_cache(ROUTER_INSTANCE *);
extern void blr_file_init(ROUTER_INSTANCE *);
extern int blr_open_binlog(ROUTER_INSTANCE *, char *);
extern void blr_write_binlog_record(ROUTER_INSTANCE *, REP_HEADER *,uint8_t *);
extern void blr_file_rotate(ROUTER_INSTANCE *, char *, uint64_t);
extern void blr_file_flush(ROUTER_INSTANCE *);
extern GWBUF *blr_read_binlog(int, unsigned int, REP_HEADER *);
#endif

View File

@ -235,6 +235,7 @@ maxscaled_error(DCB *dcb)
/*
 * Hangup handler for the maxscaled protocol.
 *
 * The DCB on which the hangup was reported is closed; the handler always
 * returns 0.
 */
static int
maxscaled_hangup(DCB *dcb)
{
dcb_close(dcb);
return 0;
}
@ -313,9 +314,11 @@ maxscaled_close(DCB *dcb)
MAXSCALED *maxscaled = dcb->protocol;
if (maxscaled && maxscaled->username)
{
free(maxscaled->username);
maxscaled->username = NULL;
}
dcb_close(dcb);
return 0;
}

View File

@ -497,7 +497,7 @@ static int gw_read_backend_event(DCB *dcb) {
{
if (nbytes_read < 5)
{
gwbuf_append(dcb->dcb_readqueue, read_buffer);
dcb->dcb_readqueue = gwbuf_append(dcb->dcb_readqueue, read_buffer);
rc = 0;
goto return_rc;
}

View File

@ -798,7 +798,7 @@ int gw_read_client_event(
}
/** succeed */
if (rc == 1) {
if (rc) {
rc = 0; /**< here '0' means success */
} else {
GWBUF* errbuf;

View File

@ -51,6 +51,8 @@ MODULES= libdebugcli.so libreadconnroute.so libtestroute.so libcli.so
all: $(MODULES)
(cd readwritesplit; make)
(cd binlog; make)
libtestroute.so: $(TESTOBJ)
$(CC) $(LDFLAGS) $(TESTOBJ) $(LIBS) -o $@
@ -73,19 +75,23 @@ libreadwritesplit.so:
clean:
$(DEL) $(OBJ) $(MODULES)
(cd readwritesplit; touch depend.mk; make clean)
(cd binlog; touch depend.mk; make clean)
tags:
ctags $(SRCS) $(HDRS)
(cd readwritesplit; make tags)
(cd binlog; make tags)
depend:
@$(DEL) depend.mk
cc -M $(CFLAGS) $(SRCS) > depend.mk
(cd readwritesplit; touch depend.mk ; make depend)
(cd binlog; touch depend.mk ; make depend)
install: $(MODULES)
install -D $(MODULES) $(DEST)/modules
(cd readwritesplit; make DEST=$(DEST) install)
(cd binlog; make DEST=$(DEST) install)
cleantests:
$(MAKE) -C readwritesplit/test cleantests

View File

@ -0,0 +1,65 @@
# This file is distributed as part of the SkySQL Gateway. It is free
# software: you can redistribute it and/or modify it under the terms of the
# GNU General Public License as published by the Free Software Foundation,
# version 2.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program; if not, write to the Free Software Foundation, Inc., 51
# Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Copyright SkySQL Ab 2013
#
# Revision History
# Date Who Description
# 2/04/14 Mark Riddoch Initial framework put in place
include ../../../../build_gateway.inc
LOGPATH := $(ROOT_PATH)/log_manager
UTILSPATH := $(ROOT_PATH)/utils
QCLASSPATH := $(ROOT_PATH)/query_classifier
CC=cc
CFLAGS=-c -fPIC -I/usr/include -I../../include -I../../../include \
-I$(LOGPATH) -I$(UTILSPATH) -I$(QCLASSPATH) \
$(MYSQL_HEADERS) -Wall -g
include ../../../../makefile.inc
LDFLAGS=-shared -L$(LOGPATH) -L$(QCLASSPATH) -L$(EMBEDDED_LIB) \
-Wl,-rpath,$(DEST)/lib \
-Wl,-rpath,$(LOGPATH) -Wl,-rpath,$(UTILSPATH) -Wl,-rpath,$(QCLASSPATH) \
-Wl,-rpath,$(EMBEDDED_LIB)
SRCS=blr.c blr_master.c blr_cache.c blr_slave.c blr_file.c
OBJ=$(SRCS:.c=.o)
LIBS=-lssl -pthread -llog_manager -lmysqld
MODULES=libbinlogrouter.so
all: $(MODULES)
$(MODULES): $(OBJ)
$(CC) $(LDFLAGS) $(OBJ) $(UTILSPATH)/skygw_utils.o $(LIBS) -o $@
.c.o:
$(CC) $(CFLAGS) $< -o $@
clean:
rm -f $(OBJ) $(MODULES)
tags:
ctags $(SRCS) $(HDRS)
depend:
@rm -f depend.mk
cc -M $(CFLAGS) $(SRCS) > depend.mk
install: $(MODULES)
install -D $(MODULES) $(DEST)/MaxScale/modules
include depend.mk

View File

@ -0,0 +1,53 @@
The binlog router is not a "normal" MaxScale router, it is not
designed to be used to route client requests to a database in the
usual proxy fashion. Rather it is designed to allow MaxScale to be
used as a relay server in a MySQL replication environment.
In this environment MaxScale sits between a master MySQL server and
a set of slave servers. The slave servers execute a change master
to the MaxScale server, otherwise they are configured in exactly
the same way as a normal MySQL slave server.
The master server configuration is unaltered, it simply sees a
single slave server.
MaxScale is configured as usual, with a service definition that
references the binlog router. The major configuration option to
consider is the router_options parameter, in the binlog router this
provides the binlog specific configuration parameters.
uuid=
This is the UUID that MaxScale uses when it connects
to the real master. It will report the master's
UUID to slaves that connect to it.
server-id=
The server-id that MaxScale uses when it connects
to the real master server. Again it will report
the master's server-id to the slaves that connect
to it.
user=
The user that MaxScale uses to login to the real
master
password=
The password that MaxScale uses to login to the
real master
master-id=
The server-id of the real master. MaxScale should
get this by sending a query, but at the moment it
is in the configuration file for ease of implementation
An example binlog service configuration is shown below:
[Binlog Service]
type=service
router=binlogrouter
servers=master
router_options=uuid=f12fcb7f-b97b-11e3-bc5e-0401152c4c22,server-id=3,user=repl,password=slavepass,master-id=1
user=maxscale
passwd=Mhu87p2D
The servers list for a binlog router service should contain just
the master server. In future a list will be given and the monitor
used to determine which server is the current master server.

View File

@ -0,0 +1,13 @@
The binlog router contained here is a prototype implementation and
should not be considered production ready.
The router has been written and tested with MySQL 5.6 as a reference
for the replication behaviour, more investigation and implementation
is likely to be needed in order to use other versions of MySQL,
MariaDB or Percona Server.
To Do List:
1. The router does not implement the replication heartbeat mechanism.
2. Performance measurements have yet to be made.

View File

@ -0,0 +1,770 @@
/*
* This file is distributed as part of MaxScale. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright SkySQL Ab 2014
*/
/**
* @file blr.c - binlog router, allows MaxScale to act as an intermediary for replication
*
* The binlog router is designed to be used in replication environments to
* increase the replication fanout of a master server. It provides a transparent
* mechanism to read the binlog entries for multiple slaves while requiring
* only a single connection to the actual master to support the slaves.
*
* The current prototype implementation is designed to support MySQL 5.6 and has
* a number of limitations. This prototype is merely a proof of concept and
* should not be considered production ready.
*
* @verbatim
* Revision History
*
* Date Who Description
* 02/04/2014 Mark Riddoch Initial implementation
*
* @endverbatim
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <service.h>
#include <server.h>
#include <router.h>
#include <atomic.h>
#include <spinlock.h>
#include <blr.h>
#include <dcb.h>
#include <spinlock.h>
#include <time.h>
#include <skygw_types.h>
#include <skygw_utils.h>
#include <log_manager.h>
#include <mysql_client_server_protocol.h>
/* Defined outside this file; presumably consulted by the LOGIF() macro - TODO confirm in log_manager.h */
extern int lm_enabled_logfiles_bitmask;
/* Version string returned by version() and logged by ModuleInit() */
static char *version_str = "V1.0.6";
/* The router entry points */
static ROUTER *createInstance(SERVICE *service, char **options);
static void *newSession(ROUTER *instance, SESSION *session);
static void closeSession(ROUTER *instance, void *router_session);
static void freeSession(ROUTER *instance, void *router_session);
static int routeQuery(ROUTER *instance, void *router_session, GWBUF *queue);
static void diagnostics(ROUTER *instance, DCB *dcb);
static void clientReply(
ROUTER *instance,
void *router_session,
GWBUF *queue,
DCB *backend_dcb);
static void errorReply(
ROUTER *instance,
void *router_session,
GWBUF *message,
DCB *backend_dcb,
error_action_t action,
bool *succp);
static uint8_t getCapabilities (ROUTER* inst, void* router_session);
/** The module object definition */
/*
 * The entry points are given positionally, so their order has to match the
 * member order of ROUTER_OBJECT (declared in a header that is not part of
 * this file). GetModuleObject() returns the address of this structure.
 */
static ROUTER_OBJECT MyObject = {
createInstance,
newSession,
closeSession,
freeSession,
routeQuery,
diagnostics,
clientReply,
errorReply,
getCapabilities
};
/* Helpers that acquire and release the per-slave rses_lock spinlock */
static bool rses_begin_locked_router_action(ROUTER_SLAVE *);
static void rses_end_locked_router_action(ROUTER_SLAVE *);
/* Spinlock taken by createInstance() while it updates the instances list */
static SPINLOCK instlock;
/* Head of the linked list of router instances created by this module */
static ROUTER_INSTANCE *instances;
/**
 * Mandatory module entry point that reports the module version.
 *
 * @return	The version string of this module
 */
char *
version()
{
	char	*module_version = version_str;

	return module_version;
}
/**
 * Module initialisation entry point, invoked when the module is
 * first loaded. Logs the module version and resets the module wide
 * state: the instance list lock and the (empty) list of instances.
 */
void
ModuleInit()
{
	LOGIF(LM, (skygw_log_write(
			LOGFILE_MESSAGE,
			"Initialise binlog router module %s.\n", version_str)));

	/* No router instances exist yet */
	spinlock_init(&instlock);
	instances = NULL;
}
/**
 * Module entry point that hands out the "module object": the
 * structure holding the set of external entry points (createInstance,
 * newSession, ...) implemented by this router.
 *
 * @return	The module object
 */
ROUTER_OBJECT *
GetModuleObject()
{
	ROUTER_OBJECT	*module_object = &MyObject;

	return module_object;
}
/**
 * Create an instance of the router for a particular service
 * within MaxScale.
 *
 * The process of creating the instance causes the router to register
 * with the master server and begin replication of the binlogs from
 * the master server to MaxScale.
 *
 * The new instance is linked onto the module wide list of instances.
 * A default binlog file stem (BINLOG_NAME_ROOT) is used whenever no
 * "filestem" option was supplied, including when no options were
 * supplied at all.
 *
 * @param service	The service this router is being created for
 * @param options	A NULL terminated array of "name=value" options
 *			for this router, may be NULL. Each entry is
 *			modified in place: the '=' is overwritten with
 *			a string terminator.
 *
 * @return The instance data for this new instance, NULL if memory
 *	   could not be allocated
 */
static ROUTER *
createInstance(SERVICE *service, char **options)
{
	ROUTER_INSTANCE	*inst;
	char		*value;
	int		i;

	/* calloc zero-fills the instance, so stats and saved_master start out cleared */
	if ((inst = calloc(1, sizeof(ROUTER_INSTANCE))) == NULL) {
		return NULL;
	}

	inst->service = service;
	spinlock_init(&inst->lock);
	inst->low_water = DEF_LOW_WATER;
	inst->high_water = DEF_HIGH_WATER;
	inst->active_logs = 0;
	inst->reconnect_pending = 0;
	inst->handling_threads = 0;
	inst->residual = NULL;
	inst->slaves = NULL;
	/*
	 * No binlog file is open yet. The binlog file routines close() the
	 * previous descriptor before installing a new one, so this must not
	 * be left at the calloc value of 0, which is a descriptor the router
	 * does not own.
	 */
	inst->binlog_fd = -1;

	/*
	 * We only support one server behind this router, since the server is
	 * the master from which we replicate binlog records. Therefore check
	 * that only one server has been defined.
	 *
	 * A later improvement will be to define multiple servers and have the
	 * router use the information that is supplied by the monitor to find
	 * which of these servers is currently the master and replicate from
	 * that server.
	 *
	 * NOTE(review): the condition is only logged, instance creation
	 * carries on regardless.
	 */
	if (service->databases == NULL || service->databases->nextdb != NULL)
	{
		LOGIF(LE, (skygw_log_write(
			LOGFILE_ERROR,
			"Error : Exactly one database server may be "
			"for use with the binlog router.")));
	}

	/*
	 * Process the options.
	 * We have an array of attribute values passed to us that we must
	 * examine. Supported attributes are:
	 *	uuid=
	 *	server-id=
	 *	user=
	 *	password=
	 *	master-id=
	 *	filestem=
	 *	lowwater=
	 *	highwater=
	 */
	if (options)
	{
		for (i = 0; options[i]; i++)
		{
			if ((value = strchr(options[i], '=')) == NULL)
			{
				LOGIF(LE, (skygw_log_write(
					LOGFILE_ERROR, "Warning : Unsupported router "
					"option %s for binlog router.",
					options[i])));
			}
			else
			{
				/* Split "name=value" into two strings */
				*value = 0;
				value++;
				if (strcmp(options[i], "uuid") == 0)
				{
					inst->uuid = strdup(value);
				}
				else if (strcmp(options[i], "server-id") == 0)
				{
					inst->serverid = atoi(value);
				}
				else if (strcmp(options[i], "user") == 0)
				{
					inst->user = strdup(value);
				}
				else if (strcmp(options[i], "password") == 0)
				{
					inst->password = strdup(value);
				}
				else if (strcmp(options[i], "master-id") == 0)
				{
					inst->masterid = atoi(value);
				}
				else if (strcmp(options[i], "filestem") == 0)
				{
					inst->fileroot = strdup(value);
				}
				else if (strcmp(options[i], "lowwater") == 0)
				{
					inst->low_water = atoi(value);
				}
				else if (strcmp(options[i], "highwater") == 0)
				{
					inst->high_water = atoi(value);
				}
				else
				{
					LOGIF(LE, (skygw_log_write(
						LOGFILE_ERROR,
						"Warning : Unsupported router "
						"option %s for binlog router.",
						options[i])));
				}
			}
		}
	}

	/*
	 * The default file stem must be applied even when no options were
	 * given at all, blr_file_init() takes the length of fileroot.
	 */
	if (inst->fileroot == NULL)
		inst->fileroot = strdup(BINLOG_NAME_ROOT);

	/*
	 * We have completed the creation of the instance data, so now
	 * insert this router instance into the linked list of routers
	 * that have been created with this module. The next pointer must
	 * not be modified after this point, it is what chains the earlier
	 * instances onto the list.
	 */
	spinlock_acquire(&instlock);
	inst->next = instances;
	instances = inst;
	spinlock_release(&instlock);

	/*
	 * Initialise the binlog file and position
	 */
	blr_file_init(inst);
	LOGIF(LT, (skygw_log_write(
			LOGFILE_TRACE,
			"Binlog router: current binlog file is: %s, current position %u\n",
			inst->binlog_name, inst->binlog_position)));

	/*
	 * Initialise the binlog cache for this router instance
	 */
	blr_init_cache(inst);

	/*
	 * Now start the replication from the master to MaxScale
	 */
	blr_start_master(inst);

	return (ROUTER *)inst;
}
/**
 * Associate a new session with this instance of the router.
 *
 * In the case of the binlog router a new session equates to a new slave
 * connecting to MaxScale and requesting binlog records. We need to go
 * through the slave registration process for this new slave.
 *
 * The slave structure is allocated zero-filled, both of its spinlocks
 * are initialised and it is linked onto the list of slaves of the
 * router instance.
 *
 * @param instance	The router instance data
 * @param session	The session itself
 * @return Session specific data for this session, NULL if memory could
 *	   not be allocated
 */
static void *
newSession(ROUTER *instance, SESSION *session)
{
	ROUTER_INSTANCE	*inst = (ROUTER_INSTANCE *)instance;
	ROUTER_SLAVE	*slave;

	LOGIF(LD, (skygw_log_write_flush(
		LOGFILE_DEBUG,
		"binlog router: %lu [newSession] new router session with "
		"session %p, and inst %p.",
		pthread_self(),
		session,
		inst)));

	if ((slave = (ROUTER_SLAVE *)calloc(1, sizeof(ROUTER_SLAVE))) == NULL)
	{
		/* An error log entry must be gated on the error log flag, not the debug one */
		LOGIF(LE, (skygw_log_write_flush(
			LOGFILE_ERROR,
			"Insufficient memory to create new slave session for binlog router")));
		return NULL;
	}

#if defined(SS_DEBUG)
	slave->rses_chk_top = CHK_NUM_ROUTER_SES;
	slave->rses_chk_tail = CHK_NUM_ROUTER_SES;
#endif

	memset(&slave->stats, 0, sizeof(SLAVE_STATS));
	atomic_add(&inst->stats.n_slaves, 1);
	slave->state = BLRS_CREATED;		/* Set initial state of the slave */
	slave->cstate = 0;
	slave->pthread = 0;
	slave->overrun = 0;
	spinlock_init(&slave->catch_lock);
	/* rses_lock is taken when the session is closed, so initialise it explicitly */
	spinlock_init(&slave->rses_lock);
	slave->dcb = session->client;
	slave->router = inst;

	/**
	 * Add this session to the list of active sessions.
	 */
	spinlock_acquire(&inst->lock);
	slave->next = inst->slaves;
	inst->slaves = slave;
	spinlock_release(&inst->lock);

	CHK_CLIENT_RSES(slave);

	return (void *)slave;
}
/**
 * Release a slave session that is no longer required.
 *
 * Each session corresponds to one slave of MaxScale. The slave count of
 * the router is decremented, the slave is unlinked from the list of
 * slaves of the router and the memory held by the slave structure
 * (hostname, user, password and the structure itself) is freed.
 *
 * @param router_instance	The instance of the router
 * @param router_client_ses	The particular session to free
 */
static void freeSession(
	ROUTER* router_instance,
	void* router_client_ses)
{
	ROUTER_INSTANCE	*router = (ROUTER_INSTANCE *)router_instance;
	ROUTER_SLAVE	*slave = (ROUTER_SLAVE *)router_client_ses;
	ROUTER_SLAVE	**link;
	int		prev_val;

	prev_val = atomic_add(&router->stats.n_slaves, -1);
	ss_dassert(prev_val > 0);

	/*
	 * Walk the chain of "next" links, starting with the list head, until
	 * the link that refers to this slave is found and bypass the slave.
	 * A slave that is not on the list leaves the list untouched.
	 */
	spinlock_acquire(&router->lock);
	link = &router->slaves;
	while (*link != NULL && *link != slave)
	{
		link = &(*link)->next;
	}
	if (*link != NULL)
	{
		*link = slave->next;
	}
	spinlock_release(&router->lock);

	LOGIF(LD, (skygw_log_write_flush(
		LOGFILE_DEBUG,
		"%lu [freeSession] Unlinked router_client_session %p from "
		"router %p. Connections : %d. ",
		pthread_self(),
		slave,
		router,
		prev_val-1)));

	/* free() accepts NULL, so members that were never set need no guard */
	free(slave->hostname);
	free(slave->user);
	free(slave->passwd);
	free(slave);
}
/**
 * Close a session with the router.
 *
 * For a slave session the count of registered slaves is decremented and
 * the slave is marked as unregistered, both while holding the session
 * lock. A NULL session is taken to be the session with the master, in
 * which case a reconnect to the master is requested.
 *
 * @param instance		The router instance data
 * @param router_session	The session being closed, NULL for the
 *				master session
 */
static void
closeSession(ROUTER *instance, void *router_session)
{
	ROUTER_INSTANCE	*router = (ROUTER_INSTANCE *)instance;
	ROUTER_SLAVE	*slave = (ROUTER_SLAVE *)router_session;

	if (slave != NULL)
	{
		CHK_CLIENT_RSES(slave);

		/* Lock the router client session for secure read and update */
		if (!rses_begin_locked_router_action(slave))
		{
			return;
		}

		/* One registered slave less on this router */
		atomic_add(&router->stats.n_registered, -1);

		/*
		 * Marking the slave as unregistered prevents the forwarding
		 * of any more binlog records to this slave.
		 */
		slave->state = BLRS_UNREGISTERED;

		rses_end_locked_router_action(slave);
		return;
	}

	/*
	 * No slave, so we must be closing the master session.
	 *
	 * TODO: Handle closure of master session
	 */
	LOGIF(LE, (skygw_log_write_flush(
		LOGFILE_ERROR, "Binlog router close session with master")));
	blr_master_reconnect(router);
}
/**
 * Data has arrived from a client, i.e. from a slave. This is likely to
 * be a packet related to the registration of the slave to receive binlog
 * records. Unlike most MaxScale routers nothing is forwarded to a backend
 * database, the request is handed to the slave request handler instead.
 *
 * @param instance		The router instance
 * @param router_session	The router session returned from the newSession call
 * @param queue			The queue of data buffers to route
 * @return The value returned by blr_slave_request() for this request
 */
static int
routeQuery(ROUTER *instance, void *router_session, GWBUF *queue)
{
	return blr_slave_request((ROUTER_INSTANCE *)instance,
				 (ROUTER_SLAVE *)router_session,
				 queue);
}
/*
 * Printable names of the binlog event types. diagnostics() prints entry i
 * of this table next to stats.events[i] for i = 0 .. 0x23, so the table
 * must keep exactly 0x24 (36) entries and the position of a name in the
 * table is presumably the numeric event type - confirm against the
 * replication protocol documentation.
 */
static char *event_names[] = {
"Invalid", "Start Event V3", "Query Event", "Stop Event", "Rotate Event",
"Integer Session Variable", "Load Event", "Slave Event", "Create File Event",
"Append Block Event", "Exec Load Event", "Delete File Event",
"New Load Event", "Rand Event", "User Variable Event", "Format Description Event",
"Transaction ID Event (2 Phase Commit)", "Begin Load Query Event",
"Execute Load Query Event", "Table Map Event", "Write Rows Event (v0)",
"Update Rows Event (v0)", "Delete Rows Event (v0)", "Write Rows Event (v1)",
"Update Rows Event (v1)", "Delete Rows Event (v1)", "Incident Event",
"Heartbeat Event", "Ignorable Event", "Rows Query Event", "Write Rows Event (v2)",
"Update Rows Event (v2)", "Delete Rows Event (v2)", "GTID Event",
"Anonymous GTID Event", "Previous GTIDS Event"
};
/**
* Display an entry from the spinlock statistics data
*
* @param dcb The DCB to print to
* @param desc Description of the statistic
* @param value The statistic value
*/
static void
spin_reporter(void *dcb, char *desc, int value)
{
dcb_printf((DCB *)dcb, "\t\t%-35s %d\n", desc, value);
}
/**
 * Display router diagnostics
 *
 * Prints the state of the master connection, the router wide statistics,
 * the per event type counters and, for every slave of the router, the
 * slave state and statistics.
 *
 * @param router	Instance of the router
 * @param dcb		DCB to send diagnostics to
 */
static void
diagnostics(ROUTER *router, DCB *dcb)
{
	ROUTER_INSTANCE	*router_inst = (ROUTER_INSTANCE *)router;
	ROUTER_SLAVE	*session;
	int		i;
	char		buf[40];
	struct tm	tm;
	long		since_reply;
	double		events_per_packet;

	dcb_printf(dcb, "\tMaster connection DCB: %p\n",
			router_inst->master);
	dcb_printf(dcb, "\tMaster connection state: %s\n",
			blrm_states[router_inst->master_state]);

	/* asctime_r terminates the text with a newline of its own */
	localtime_r(&router_inst->stats.lastReply, &tm);
	asctime_r(&tm, buf);

	dcb_printf(dcb, "\tNumber of master connects: %d\n",
			router_inst->stats.n_masterstarts);
	dcb_printf(dcb, "\tNumber of delayed reconnects: %d\n",
			router_inst->stats.n_delayedreconnects);
	dcb_printf(dcb, "\tCurrent binlog file: %s\n",
			router_inst->binlog_name);
	dcb_printf(dcb, "\tCurrent binlog position: %u\n",
			router_inst->binlog_position);
	dcb_printf(dcb, "\tNumber of slave servers: %u\n",
			router_inst->stats.n_slaves);
	dcb_printf(dcb, "\tNumber of binlog events received: %u\n",
			router_inst->stats.n_binlogs);
	dcb_printf(dcb, "\tNumber of fake binlog events: %u\n",
			router_inst->stats.n_fakeevents);
	dcb_printf(dcb, "\tNumber of artificial binlog events: %u\n",
			router_inst->stats.n_artificial);
	dcb_printf(dcb, "\tNumber of binlog events in error: %u\n",
			router_inst->stats.n_binlog_errors);
	dcb_printf(dcb, "\tNumber of binlog rotate events: %u\n",
			router_inst->stats.n_rotates);
	dcb_printf(dcb, "\tNumber of binlog cache hits: %u\n",
			router_inst->stats.n_cachehits);
	dcb_printf(dcb, "\tNumber of binlog cache misses: %u\n",
			router_inst->stats.n_cachemisses);
	dcb_printf(dcb, "\tNumber of heartbeat events: %u\n",
			router_inst->stats.n_heartbeats);
	dcb_printf(dcb, "\tNumber of packets received: %u\n",
			router_inst->stats.n_reads);
	dcb_printf(dcb, "\tNumber of residual data packets: %u\n",
			router_inst->stats.n_residuals);

	/* Avoid reporting "nan" before the first packet has been read */
	events_per_packet = 0.0;
	if (router_inst->stats.n_reads != 0)
	{
		events_per_packet = (double)router_inst->stats.n_binlogs
					/ router_inst->stats.n_reads;
	}
	dcb_printf(dcb, "\tAverage events per packet %.1f\n",
			events_per_packet);

	dcb_printf(dcb, "\tLast event from master at: %s",
			buf);
	/* time_t need not have the width of int, print it as a long */
	since_reply = (long)(time(0) - router_inst->stats.lastReply);
	dcb_printf(dcb, "\t (%ld seconds ago)\n",
			since_reply);
	dcb_printf(dcb, "\tLast event from master: 0x%x\n",
			router_inst->lastEventReceived);
	if (router_inst->active_logs)
		dcb_printf(dcb, "\tRouter processing binlog records\n");
	if (router_inst->reconnect_pending)
		dcb_printf(dcb, "\tRouter pending reconnect to master\n");

	/* 0x24 is the number of entries in the event_names table */
	dcb_printf(dcb, "\tEvents received:\n");
	for (i = 0; i < 0x24; i++)
	{
		dcb_printf(dcb, "\t\t%-38s: %u\n", event_names[i], router_inst->stats.events[i]);
	}

#if SPINLOCK_PROFILE
	dcb_printf(dcb, "\tSpinlock statistics (instlock):\n");
	spinlock_stats(&instlock, spin_reporter, dcb);
	dcb_printf(dcb, "\tSpinlock statistics (instance lock):\n");
	spinlock_stats(&router_inst->lock, spin_reporter, dcb);
#endif

	if (router_inst->slaves)
	{
		dcb_printf(dcb, "\tSlaves:\n");
		/* The slave list is walked with the instance lock held */
		spinlock_acquire(&router_inst->lock);
		session = router_inst->slaves;
		while (session)
		{
			dcb_printf(dcb, "\t\tServer-id: %d\n", session->serverid);
			if (session->hostname)
				dcb_printf(dcb, "\t\tHostname: %s\n", session->hostname);
			dcb_printf(dcb, "\t\tSlave DCB: %p\n", session->dcb);
			dcb_printf(dcb, "\t\tNext Sequence No: %d\n", session->seqno);
			dcb_printf(dcb, "\t\tState: %s\n", blrs_states[session->state]);
			dcb_printf(dcb, "\t\tBinlog file: %s\n", session->binlogfile);
			dcb_printf(dcb, "\t\tBinlog position: %u\n", session->binlog_pos);
			if (session->nocrc)
				dcb_printf(dcb, "\t\tMaster Binlog CRC: None\n");
			dcb_printf(dcb, "\t\tNo. requests: %u\n", session->stats.n_requests);
			dcb_printf(dcb, "\t\tNo. events sent: %u\n", session->stats.n_events);
			dcb_printf(dcb, "\t\tNo. bursts sent: %u\n", session->stats.n_bursts);
			dcb_printf(dcb, "\t\tNo. flow control: %u\n", session->stats.n_flows);
			dcb_printf(dcb, "\t\tNo. catchup NRs: %u\n", session->stats.n_catchupnr);
			dcb_printf(dcb, "\t\tNo. already up to date: %u\n", session->stats.n_alreadyupd);
			dcb_printf(dcb, "\t\tNo. up to date: %u\n", session->stats.n_upd);
			dcb_printf(dcb, "\t\tNo. of low water cbs %u\n", session->stats.n_cb);
			dcb_printf(dcb, "\t\tNo. of drained cbs %u\n", session->stats.n_dcb);
			dcb_printf(dcb, "\t\tNo. of low water cbs N/A %u\n", session->stats.n_cbna);
			dcb_printf(dcb, "\t\tNo. of events > high water %u\n", session->stats.n_above);
			dcb_printf(dcb, "\t\tNo. of failed reads %u\n", session->stats.n_failed_read);
			dcb_printf(dcb, "\t\tNo. of nested distribute events %u\n", session->stats.n_overrun);
			dcb_printf(dcb, "\t\tNo. of distribute action 1 %u\n", session->stats.n_actions[0]);
			dcb_printf(dcb, "\t\tNo. of distribute action 2 %u\n", session->stats.n_actions[1]);
			dcb_printf(dcb, "\t\tNo. of distribute action 3 %u\n", session->stats.n_actions[2]);
			if ((session->cstate & CS_UPTODATE) == 0)
			{
				dcb_printf(dcb, "\t\tSlave is in catchup mode. %s\n",
					((session->cstate & CS_EXPECTCB) == 0 ? "" :
					"Waiting for DCB queue to drain."));
			}
			else
			{
				dcb_printf(dcb, "\t\tSlave is in normal mode.\n");
				if (session->binlog_pos != router_inst->binlog_position)
				{
					dcb_printf(dcb, "\t\tSlave reports up to date however "
						"the slave binlog position does not match the master\n");
				}
			}
#if SPINLOCK_PROFILE
			dcb_printf(dcb, "\tSpinlock statistics (catch_lock):\n");
			spinlock_stats(&session->catch_lock, spin_reporter, dcb);
			dcb_printf(dcb, "\tSpinlock statistics (rses_lock):\n");
			spinlock_stats(&session->rses_lock, spin_reporter, dcb);
#endif
			session = session->next;
		}
		spinlock_release(&router_inst->lock);
	}
}
/**
 * Client Reply routine - for this router the reply is a message from the
 * master server. It is counted as a read, passed to the routine that
 * handles master responses (binlog records or part of the registration
 * handshake) and the time of the reply is recorded afterwards.
 *
 * @param instance		The router instance
 * @param router_session	The router session, not used
 * @param queue			The GWBUF with reply data
 * @param backend_dcb		The DCB for the connection to the master, not used
 */
static void
clientReply(ROUTER *instance, void *router_session, GWBUF *queue, DCB *backend_dcb)
{
	ROUTER_INSTANCE	*inst = (ROUTER_INSTANCE *)instance;

	atomic_add(&inst->stats.n_reads, 1);

	blr_master_response(inst, queue);

	/* Stamp the reply only once the response has been handled */
	inst->stats.lastReply = time(0);
}
/**
 * Error Reply routine
 *
 * This router takes no recovery action: the error is written to the
 * error log and the action is reported as failed.
 *
 * The message is a GWBUF, not a C string, so it cannot be printed with
 * a plain "%s". NOTE(review): the buffer is presumed to hold a MySQL
 * error packet (4 byte packet header, 0xff marker, 2 byte error code,
 * then text that is not NUL terminated) - confirm against the callers.
 * Anything that does not look like that is logged by address and length
 * only.
 *
 * @param instance		The router instance
 * @param router_session	The router session
 * @param message		The error message to reply
 * @param backend_dcb		The backend DCB
 * @param action		The action: REPLY, REPLY_AND_CLOSE, NEW_CONNECTION
 * @param succp			Result of action, always set to false
 *
 */
static void
errorReply(ROUTER *instance, void *router_session, GWBUF *message, DCB *backend_dcb, error_action_t action, bool *succp)
{
	unsigned char	*data = NULL;
	int		len = 0;

	if (message != NULL)
	{
		data = (unsigned char *)GWBUF_DATA(message);
		len = (int)GWBUF_LENGTH(message);
	}

	if (len > 7 && data[4] == 0xff)
	{
		/* The text is bounded by the buffer length, it has no terminator */
		LOGIF(LE, (skygw_log_write_flush(
			LOGFILE_ERROR, "Error Reply %u '%.*s'",
			(unsigned int)(data[5] | (data[6] << 8)),
			len - 7,
			(char *)&data[7])));
	}
	else
	{
		LOGIF(LE, (skygw_log_write_flush(
			LOGFILE_ERROR,
			"Error Reply received, message buffer %p of %d bytes",
			(void *)message,
			len)));
	}
	*succp = false;
}
/** to be inline'd */
/**
 * Acquire the lock of a router client session.
 *
 * The current implementation acquires the session spinlock
 * unconditionally and therefore always reports success. Callers must
 * release the lock with rses_end_locked_router_action().
 *
 * @param rses	The router client session (slave) to lock
 *
 * @return true, the session lock is held on return
 */
static bool rses_begin_locked_router_action(ROUTER_SLAVE *rses)
{
	CHK_CLIENT_RSES(rses);

	spinlock_acquire(&rses->rses_lock);

	return true;
}
/** to be inline'd */
/**
 * Release the lock of a router client session that was acquired with
 * rses_begin_locked_router_action().
 *
 * @param rses	The router client session (slave) to unlock
 */
static void rses_end_locked_router_action(ROUTER_SLAVE * rses)
{
	CHK_CLIENT_RSES(rses);

	spinlock_release(&rses->rses_lock);
}
/**
 * Report the capabilities of the router. The binlog router reports
 * none: the returned value is always 0.
 *
 * @param inst			The router instance, not used
 * @param router_session	The router session, not used
 * @return Always 0
 */
static uint8_t getCapabilities(ROUTER *inst, void *router_session)
{
	uint8_t	capabilities = 0;

	return capabilities;
}

View File

@ -0,0 +1,69 @@
/*
* This file is distributed as part of MaxScale. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright SkySQL Ab 2014
*/
/**
* @file blr_cache.c - binlog router cache, manage the binlog cache
*
* The binlog router is designed to be used in replication environments to
* increase the replication fanout of a master server. It provides a transparent
* mechanism to read the binlog entries for multiple slaves while requiring
* only a single connection to the actual master to support the slaves.
*
* The current prototype implementation is designed to support MySQL 5.6 and has
* a number of limitations. This prototype is merely a proof of concept and
* should not be considered production ready.
*
* @verbatim
* Revision History
*
* Date Who Description
* 07/04/2014 Mark Riddoch Initial implementation
*
* @endverbatim
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <service.h>
#include <server.h>
#include <router.h>
#include <atomic.h>
#include <spinlock.h>
#include <blr.h>
#include <dcb.h>
#include <spinlock.h>
#include <skygw_types.h>
#include <skygw_utils.h>
#include <log_manager.h>
/* Defined outside this file; not referenced by the code in this file as shown */
extern int lm_enabled_logfiles_bitmask;
/**
* Initialise the cache for this instance of the binlog router.
*
* NOTE: this is currently an empty placeholder. It does not touch the
* router instance; in particular it does not determine the binlog file
* or the position to read from.
*
* @param router The router instance
*/
void
blr_init_cache(ROUTER_INSTANCE *router)
{
}

View File

@ -0,0 +1,346 @@
/*
* This file is distributed as part of MaxScale. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright SkySQL Ab 2014
*/
/**
* @file blr_file.c - contains code for the router binlog file management
*
*
* @verbatim
* Revision History
*
* Date Who Description
* 14/04/2014 Mark Riddoch Initial implementation
*
* @endverbatim
*/
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <dirent.h>
#include <fcntl.h>
#include <unistd.h>
#include <service.h>
#include <server.h>
#include <router.h>
#include <atomic.h>
#include <spinlock.h>
#include <blr.h>
#include <dcb.h>
#include <spinlock.h>
#include <skygw_types.h>
#include <skygw_utils.h>
#include <log_manager.h>
/* Defined outside this file; presumably consulted by the LOGIF() macro - TODO confirm in log_manager.h */
extern int lm_enabled_logfiles_bitmask;
/* File local helpers, defined further down in this file */
/* Create a new binlog file and make it the current file of the router */
static void blr_file_create(ROUTER_INSTANCE *router, char *file);
/* Open an existing binlog file and make it the current file of the router */
static void blr_file_append(ROUTER_INSTANCE *router, char *file);
/* Decode a little-endian numeric field of the given number of bits */
static uint32_t extract_field(uint8_t *src, int bits);
/**
 * Initialise the binlog file for this instance. MaxScale will look
 * for all the binlogs that it has on local disk, determine the next
 * binlog to use and initialise it for writing, determining the
 * next record to be fetched from the real master.
 *
 * The binlog files live in a directory named after the service, below
 * $MAXSCALE_HOME or below /usr/local/skysql/MaxScale when that variable
 * is not set. The directory is created when it cannot be accessed. If
 * no binlog file exists the first one is created, otherwise the file
 * with the highest sequence number is opened for append.
 *
 * @param router	The router instance this defines the master for this replication chain
 */
void
blr_file_init(ROUTER_INSTANCE *router)
{
	const char	*home;
	char		path[1024], filename[1050];
	int		file_found, n = 1;
	int		root_len, i, len;
	DIR		*dirp;
	struct dirent	*dp;

	if ((home = getenv("MAXSCALE_HOME")) == NULL)
	{
		home = "/usr/local/skysql/MaxScale";
	}

	/* The home directory comes from the environment, so bound the copy */
	len = snprintf(path, sizeof(path), "%s/%s", home, router->service->name);
	if (len < 0 || (size_t)len >= sizeof(path))
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Binlog directory path for service %s is too long.\n",
			router->service->name)));
		return;
	}

	if (access(path, R_OK) == -1 && mkdir(path, 0777) == -1)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to create binlog directory %s, %s.\n",
			path, strerror(errno))));
	}

	/* First try to find a binlog file number by reading the directory */
	root_len = strlen(router->fileroot);
	if ((dirp = opendir(path)) == NULL)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to read binlog directory %s, %s.\n",
			path, strerror(errno))));
	}
	else
	{
		while ((dp = readdir(dirp)) != NULL)
		{
			/*
			 * The sequence number follows the file stem and one
			 * separator character. A name that is just the stem
			 * has nothing after it to parse.
			 */
			if (strncmp(dp->d_name, router->fileroot, root_len) == 0
					&& dp->d_name[root_len] != '\0')
			{
				i = atoi(dp->d_name + root_len + 1);
				if (i > n)
					n = i;
			}
		}
		closedir(dirp);
	}

	/* Starting from the candidate, step over every file that exists */
	file_found = 0;
	do {
		snprintf(filename, sizeof(filename), "%s/" BINLOG_NAMEFMT,
				path, router->fileroot, n);
		if (access(filename, R_OK) != -1)
		{
			file_found = 1;
			n++;
		}
		else
			file_found = 0;
	} while (file_found);
	n--;		/* n is now the number of the last file that exists */

	if (n == 0)		// No binlog files found
	{
		snprintf(filename, sizeof(filename), BINLOG_NAMEFMT,
				router->fileroot, 1);
		blr_file_create(router, filename);
	}
	else
	{
		snprintf(filename, sizeof(filename), BINLOG_NAMEFMT,
				router->fileroot, n);
		blr_file_append(router, filename);
	}
}
/**
 * Rotate to a new binlog file by creating the named file and making it
 * the current binlog file of the router.
 *
 * @param router	The router instance
 * @param file		The name of the new binlog file
 * @param pos		Position within the new file, currently not used
 */
void
blr_file_rotate(ROUTER_INSTANCE *router, char *file, uint64_t pos)
{
	(void)pos;

	blr_file_create(router, file);
}
/**
 * Create a new binlog file for the router to use.
 *
 * The file is created in the directory of the service, the binlog magic
 * number is written to it and it becomes the current binlog file of the
 * router; the previously open binlog file is closed.
 *
 * As before, a failure to create the file is logged and the router is
 * still switched to the new file name, with an invalid file descriptor
 * (-1), so that no records are written into the previous file at
 * positions that belong to the new one.
 *
 * @param router	The router instance
 * @param file		The binlog file name
 */
static void
blr_file_create(ROUTER_INSTANCE *router, char *file)
{
	const char	*home;
	char		path[1024];
	int		fd = -1, len;
	unsigned char	magic[] = BINLOG_MAGIC;

	if ((home = getenv("MAXSCALE_HOME")) == NULL)
	{
		home = "/usr/local/skysql/MaxScale";
	}

	/* The home directory comes from the environment, so bound the copy */
	len = snprintf(path, sizeof(path), "%s/%s/%s",
				home, router->service->name, file);
	if (len < 0 || (size_t)len >= sizeof(path))
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to create binlog file %s, the path is too long.\n",
			file)));
	}
	else if ((fd = open(path, O_RDWR|O_CREAT, 0666)) == -1)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to create binlog file %s\n", path)));
	}
	else
	{
		if (write(fd, magic, 4) != 4)
		{
			LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
				"Failed to write magic number to binlog file %s, %s.\n",
				path, strerror(errno))));
		}
		fsync(fd);
	}

	close(router->binlog_fd);
	strcpy(router->binlog_name, file);
	router->binlog_position = 4;	/* Initial position after the magic number */
	router->binlog_fd = fd;
}
/**
 * Prepare an existing binlog file to be appended to.
 *
 * The file is opened and becomes the current binlog file of the router,
 * with the current position set to the end of the file. The previously
 * open binlog file is closed. If the file cannot be opened the error is
 * logged and the router is left unchanged.
 *
 * @param router	The router instance
 * @param file		The binlog file name
 */
static void
blr_file_append(ROUTER_INSTANCE *router, char *file)
{
	const char	*home;
	char		path[1024];
	int		fd, len;

	if ((home = getenv("MAXSCALE_HOME")) == NULL)
	{
		home = "/usr/local/skysql/MaxScale";
	}

	/* The home directory comes from the environment, so bound the copy */
	len = snprintf(path, sizeof(path), "%s/%s/%s",
				home, router->service->name, file);
	if (len < 0 || (size_t)len >= sizeof(path))
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to open binlog file %s for append, "
			"the path is too long.\n",
			file)));
		return;
	}

	/*
	 * NOTE(review): records are written with pwrite() elsewhere in this
	 * file. On Linux a descriptor opened with O_APPEND makes pwrite()
	 * append regardless of the offset it is given - confirm that is the
	 * intended behaviour.
	 */
	if ((fd = open(path, O_RDWR|O_APPEND, 0666)) == -1)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to open binlog file %s for append.\n",
			path)));
		return;
	}
	fsync(fd);
	close(router->binlog_fd);
	strcpy(router->binlog_name, file);
	router->binlog_position = lseek(fd, 0L, SEEK_END);
	router->binlog_fd = fd;
}
/**
 * Write a binlog entry to disk.
 *
 * The record is written at the file offset at which the event starts,
 * that is the next position given in the header less the size of the
 * event. The current binlog position of the router is advanced to the
 * next position whether or not the write succeeded, as it was before;
 * a failed or short write is now reported in the error log.
 *
 * @param router	The router instance
 * @param hdr		The replication header of the record, supplies the
 *			event size and the position of the next event
 * @param buf		The binlog record
 */
void
blr_write_binlog_record(ROUTER_INSTANCE *router, REP_HEADER *hdr, uint8_t *buf)
{
	ssize_t	written;

	written = pwrite(router->binlog_fd, buf, hdr->event_size,
				hdr->next_pos - hdr->event_size);
	if (written == -1)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to write binlog record at %u of %s, %s.\n",
			(unsigned int)(hdr->next_pos - hdr->event_size),
			router->binlog_name,
			strerror(errno))));
	}
	else if ((size_t)written != (size_t)hdr->event_size)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Short write of binlog record at %u of %s. "
			"Expected %u bytes wrote %ld bytes.\n",
			(unsigned int)(hdr->next_pos - hdr->event_size),
			router->binlog_name,
			(unsigned int)hdr->event_size,
			(long)written)));
	}
	router->binlog_position = hdr->next_pos;
}
/**
 * Force the content of the current binlog file of the router out to
 * disk with fsync(). The result of the fsync() call is not examined.
 *
 * @param router	The binlog router
 */
void
blr_file_flush(ROUTER_INSTANCE *router)
{
	int	binlog_fd = router->binlog_fd;

	fsync(binlog_fd);
}
/**
 * Open a binlog file of the router for reading.
 *
 * The file is looked for in the directory of the service, below
 * $MAXSCALE_HOME or below /usr/local/skysql/MaxScale when that variable
 * is not set.
 *
 * @param router	The router instance
 * @param binlog	The name of the binlog file
 * @return A read only file descriptor, -1 if the file could not be
 *	   opened (the failure is logged)
 */
int
blr_open_binlog(ROUTER_INSTANCE *router, char *binlog)
{
	const char	*home;
	char		path[1024];
	int		rval, len;

	if ((home = getenv("MAXSCALE_HOME")) == NULL)
	{
		home = "/usr/local/skysql/MaxScale";
	}

	/* The home directory comes from the environment, so bound the copy */
	len = snprintf(path, sizeof(path), "%s/%s/%s",
				home, router->service->name, binlog);
	if (len < 0 || (size_t)len >= sizeof(path))
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to open binlog file %s, the path is too long.\n",
			binlog)));
		return -1;
	}

	if ((rval = open(path, O_RDONLY, 0666)) == -1)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to open binlog file %s\n", path)));
	}

	return rval;
}
/**
 * Read a replication event into a GWBUF structure.
 *
 * The 19 byte event header is read first and decoded into hdr, then a
 * buffer of the event size given in the header is allocated and the
 * remainder of the event is read into it after a copy of the header.
 *
 * @param fd	File descriptor of the binlog file
 * @param pos	Position of binlog record to read
 * @param hdr	Binlog header to populate
 * @return	The binlog record wrapped in a GWBUF structure, NULL if the
 *		record could not be read (the failure is logged)
 */
GWBUF *
blr_read_binlog(int fd, unsigned int pos, REP_HEADER *hdr)
{
	uint8_t		hdbuf[19];
	GWBUF		*result;
	unsigned char	*data;
	unsigned int	body_len;
	int		n;

	if (lseek(fd, pos, SEEK_SET) != pos)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to seek for binlog entry, "
			"at %u.\n", pos)));
		return NULL;
	}

	/* Read the header information from the file */
	if ((n = read(fd, hdbuf, sizeof(hdbuf))) != (int)sizeof(hdbuf))
	{
		if (n == -1)
		{
			/* errno is only meaningful when read() itself failed */
			LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
				"Failed to read header for binlog entry, "
				"at %u (%s).\n", pos, strerror(errno))));
		}
		else
		{
			LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
				"Short read when reading the header at %u. "
				"Expected 19 bytes got %d bytes.\n",
				pos, n)));
		}
		return NULL;
	}
	hdr->timestamp = extract_field(hdbuf, 32);
	hdr->event_type = hdbuf[4];
	hdr->serverid = extract_field(&hdbuf[5], 32);
	hdr->event_size = extract_field(&hdbuf[9], 32);
	hdr->next_pos = extract_field(&hdbuf[13], 32);
	hdr->flags = extract_field(&hdbuf[17], 16);

	/*
	 * The event size includes the header. A smaller value means the
	 * file is corrupt, and would make the length of the remainder wrap
	 * around and the read below overrun the buffer.
	 */
	if (hdr->event_size < sizeof(hdbuf))
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Invalid event size %u in the header of the binlog "
			"entry at %u.\n",
			(unsigned int)hdr->event_size, pos)));
		return NULL;
	}
	body_len = (unsigned int)(hdr->event_size - sizeof(hdbuf));

	if ((result = gwbuf_alloc(hdr->event_size)) == NULL)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Failed to allocate memory for binlog entry, "
			"size %u at %u.\n",
			(unsigned int)hdr->event_size, pos)));
		return NULL;
	}
	data = GWBUF_DATA(result);
	memcpy(data, hdbuf, sizeof(hdbuf));	// Copy the header in
	n = read(fd, &data[sizeof(hdbuf)], body_len);	// Read the balance
	if (n < 0 || (unsigned int)n != body_len)
	{
		LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
			"Short read when reading the event at %u. "
			"Expected %u bytes got %d bytes.\n",
			pos, body_len, n)));
		/* Consuming the whole buffer releases it */
		gwbuf_consume(result, hdr->event_size);
		return NULL;
	}
	return result;
}
/**
 * Extract a numeric field from a packet of the specified number of bits
 *
 * The field is stored least significant byte first. Only whole bytes
 * are extracted: one byte is consumed for every 8 bits, or part of
 * 8 bits, requested.
 *
 * @param src	The raw packet source
 * @param bits	The number of bits to extract (multiple of 8, at most 32)
 * @return	The value of the field, 0 if bits is not positive
 */
static uint32_t
extract_field(uint8_t *src, int bits)
{
	uint32_t	rval = 0, shift = 0;

	while (bits > 0)
	{
		/*
		 * Widen the byte before shifting. Without the cast the byte
		 * is promoted to a signed int and shifting a value of 0x80
		 * or more by 24 bits overflows into the sign bit.
		 */
		rval |= (uint32_t)(*src++) << shift;
		shift += 8;
		bits -= 8;
	}
	return rval;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,944 @@
/*
* This file is distributed as part of MaxScale. It is free
* software: you can redistribute it and/or modify it under the terms of the
* GNU General Public License as published by the Free Software Foundation,
* version 2.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License along with
* this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Copyright SkySQL Ab 2014
*/
/**
* @file blr_slave.c - contains code for the router to slave communication
*
* The binlog router is designed to be used in replication environments to
* increase the replication fanout of a master server. It provides a transparent
* mechanism to read the binlog entries for multiple slaves while requiring
* only a single connection to the actual master to support the slaves.
*
* The current prototype implementation is designed to support MySQL 5.6 and has
* a number of limitations. This prototype is merely a proof of concept and
* should not be considered production ready.
*
* @verbatim
* Revision History
*
* Date Who Description
* 14/04/2014 Mark Riddoch Initial implementation
*
* @endverbatim
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <service.h>
#include <server.h>
#include <router.h>
#include <atomic.h>
#include <spinlock.h>
#include <blr.h>
#include <dcb.h>
#include <spinlock.h>
#include <skygw_types.h>
#include <skygw_utils.h>
#include <log_manager.h>
/* Forward declarations of routines defined later in this file (beyond blr_slave_request) */
static uint32_t extract_field(uint8_t *src, int bits);
static void encode_value(unsigned char *data, unsigned int value, int len);
/* Handlers that blr_slave_request() dispatches to for COM_QUERY, COM_REGISTER_SLAVE and COM_BINLOG_DUMP */
static int blr_slave_query(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue);
static int blr_slave_replay(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *master);
static void blr_slave_send_error(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, char *msg);
static int blr_slave_send_timestamp(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave);
static int blr_slave_register(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue);
static int blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue);
/* Not static: this routine has external linkage */
int blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave);
static uint8_t *blr_build_header(GWBUF *pkt, REP_HEADER *hdr);
static int blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data);
/* Defined outside this file; presumably consulted by the LOGIF() macro - TODO confirm in log_manager.h */
extern int lm_enabled_logfiles_bitmask;
/**
 * Dispatch a request packet received from a slave server.
 *
 * Only a small set of commands is handled: SQL queries (COM_QUERY), the
 * slave registration command (COM_REGISTER_SLAVE) and the binlog dump
 * command (COM_BINLOG_DUMP). COM_QUIT is only logged at debug level and
 * any other command is logged as an error.
 *
 * Where possible the responses sent to the slave are cached copies of the
 * responses the real master gave to the same commands; otherwise the router
 * synthesizes the response itself.
 *
 * @param router    The router instance, this defines the master for this replication chain
 * @param slave     The slave specific data
 * @param queue     The incoming request packet
 * @return          The value returned by the command handler, or 0 if the
 *                  slave state is invalid or no handler was invoked
 */
int
blr_slave_request(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue)
{
    int result = 0;

    /* Refuse to process anything if the state machine is corrupt */
    if (slave->state < 0 || slave->state > BLRS_MAXSTATE)
    {
        LOGIF(LE, (skygw_log_write(
            LOGFILE_ERROR, "Invalid slave state machine state (%d) for binlog router.\n",
            slave->state)));
        gwbuf_consume(queue, gwbuf_length(queue));
        return 0;
    }

    atomic_add(&slave->stats.n_requests, 1);

    switch (MYSQL_COMMAND(queue))
    {
    case COM_QUERY:
        result = blr_slave_query(router, slave, queue);
        break;

    case COM_REGISTER_SLAVE:
        result = blr_slave_register(router, slave, queue);
        break;

    case COM_BINLOG_DUMP:
        result = blr_slave_binlog_dump(router, slave, queue);
        break;

    case COM_QUIT:
        LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG,
            "COM_QUIT received from slave with server_id %d\n",
            slave->serverid)));
        break;

    default:
        LOGIF(LE, (skygw_log_write(
            LOGFILE_ERROR,
            "Unexpected MySQL Command (%d) received from slave\n",
            MYSQL_COMMAND(queue))));
        break;
    }

    return result;
}
/**
 * Fetch the next word of a query via strtok_r().
 *
 * Returns an empty string rather than NULL when the query has no more
 * words, so callers can pass the result straight to strcasecmp().
 *
 * @param str   The string to tokenise, or NULL to continue a previous scan
 * @param sep   The set of separator characters
 * @param save  The strtok_r() context pointer
 * @return      The next word, or "" if there are no more words
 */
static char *
blr_query_next_word(char *str, const char *sep, char **save)
{
    char *tok = strtok_r(str, sep, save);

    return tok != NULL ? tok : "";
}

/**
 * Handle a query from the slave. This is expected to be one of the "standard"
 * queries we expect as part of the registration process. Most of these can
 * be dealt with by replaying the stored responses we got from the master
 * when MaxScale registered as a slave. The exception to the rule is the
 * request to obtain the current timestamp value of the server.
 *
 * Five select statements are currently supported:
 *      SELECT UNIX_TIMESTAMP();
 *      SELECT @master_binlog_checksum
 *      SELECT @@GLOBAL.GTID_MODE
 *      SELECT VERSION()
 *      SELECT 1
 *
 * Two show commands are supported:
 *      SHOW VARIABLES LIKE 'SERVER_ID'
 *      SHOW VARIABLES LIKE 'SERVER_UUID'
 *
 * Five set commands are supported:
 *      SET @master_binlog_checksum = @@global.binlog_checksum
 *      SET @master_heartbeat_period=...
 *      SET @slave_uuid=...
 *      SET NAMES latin1
 *      SET NAMES utf8
 *
 * Any other statement, including an empty or truncated one, is logged and
 * answered with an error packet.
 *
 * @param router    The router instance, this defines the master for this replication chain
 * @param slave     The slave specific data
 * @param queue     The incoming request packet
 * @return          Non-zero if data has been sent
 */
static int
blr_slave_query(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue)
{
    char        *qtext, *query_text;
    char        *sep = " ,=";
    char        *word, *brkb;
    uint32_t    payload_len;
    int         query_len;

    qtext = GWBUF_DATA(queue);
    payload_len = extract_field((uint8_t *)qtext, 24);
    /* The payload includes the command byte; never let the length go negative */
    query_len = payload_len > 0 ? (int)(payload_len - 1) : 0;
    qtext += 5;         // Skip header and first byte of the payload

    if ((query_text = strndup(qtext, query_len)) == NULL)
    {
        LOGIF(LE, (skygw_log_write(
            LOGFILE_ERROR, "Failed to allocate memory to parse query from slave.\n")));
        return 0;
    }

    LOGIF(LT, (skygw_log_write(
        LOGFILE_TRACE, "Execute statement from the slave '%s'\n", query_text)));

    /*
     * Implement a very rudimentary "parsing" of the query text by extracting
     * the words from the statement and matching them against the subset of
     * queries we are expecting from the slave. We already have responses to
     * these commands, except for the select of UNIX_TIMESTAMP(), that we have
     * saved from MaxScale's own interaction with the real master. We simply
     * replay these saved responses to the slave.
     *
     * blr_query_next_word() never returns NULL, so a statement that ends
     * early simply fails every comparison and drops through to the error
     * reply at the end of the function.
     */
    word = blr_query_next_word(query_text, sep, &brkb);
    if (strcasecmp(word, "SELECT") == 0)
    {
        word = blr_query_next_word(NULL, sep, &brkb);
        if (strcasecmp(word, "UNIX_TIMESTAMP()") == 0)
        {
            free(query_text);
            return blr_slave_send_timestamp(router, slave);
        }
        else if (strcasecmp(word, "@master_binlog_checksum") == 0)
        {
            free(query_text);
            return blr_slave_replay(router, slave, router->saved_master.chksum2);
        }
        else if (strcasecmp(word, "@@GLOBAL.GTID_MODE") == 0)
        {
            free(query_text);
            return blr_slave_replay(router, slave, router->saved_master.gtid_mode);
        }
        else if (strcasecmp(word, "1") == 0)
        {
            free(query_text);
            return blr_slave_replay(router, slave, router->saved_master.select1);
        }
        else if (strcasecmp(word, "VERSION()") == 0)
        {
            free(query_text);
            return blr_slave_replay(router, slave, router->saved_master.selectver);
        }
    }
    else if (strcasecmp(word, "SHOW") == 0)
    {
        word = blr_query_next_word(NULL, sep, &brkb);
        if (strcasecmp(word, "VARIABLES") == 0)
        {
            word = blr_query_next_word(NULL, sep, &brkb);
            if (strcasecmp(word, "LIKE") == 0)
            {
                word = blr_query_next_word(NULL, sep, &brkb);
                if (strcasecmp(word, "'SERVER_ID'") == 0)
                {
                    free(query_text);
                    return blr_slave_replay(router, slave, router->saved_master.server_id);
                }
                else if (strcasecmp(word, "'SERVER_UUID'") == 0)
                {
                    free(query_text);
                    return blr_slave_replay(router, slave, router->saved_master.uuid);
                }
            }
        }
    }
    else if (strcasecmp(word, "SET") == 0)
    {
        word = blr_query_next_word(NULL, sep, &brkb);
        if (strcasecmp(word, "@master_heartbeat_period") == 0)
        {
            free(query_text);
            return blr_slave_replay(router, slave, router->saved_master.heartbeat);
        }
        else if (strcasecmp(word, "@master_binlog_checksum") == 0)
        {
            /* Remember whether the slave asked for events without a CRC */
            word = blr_query_next_word(NULL, sep, &brkb);
            if (strcasecmp(word, "'none'") == 0)
                slave->nocrc = 1;
            else
                slave->nocrc = 0;
            free(query_text);
            return blr_slave_replay(router, slave, router->saved_master.chksum1);
        }
        else if (strcasecmp(word, "@slave_uuid") == 0)
        {
            free(query_text);
            return blr_slave_replay(router, slave, router->saved_master.setslaveuuid);
        }
        else if (strcasecmp(word, "NAMES") == 0)
        {
            word = blr_query_next_word(NULL, sep, &brkb);
            if (strcasecmp(word, "latin1") == 0)
            {
                free(query_text);
                return blr_slave_replay(router, slave, router->saved_master.setnames);
            }
            else if (strcasecmp(word, "utf8") == 0)
            {
                free(query_text);
                return blr_slave_replay(router, slave, router->saved_master.utf8);
            }
        }
    }
    free(query_text);

    /* strtok_r() has chopped up the first copy, take a fresh one for the log */
    query_text = strndup(qtext, query_len);
    LOGIF(LE, (skygw_log_write(
        LOGFILE_ERROR, "Unexpected query from slave server %s\n",
        query_text != NULL ? query_text : "(unavailable)")));
    free(query_text);
    blr_slave_send_error(router, slave, "Unexpected SQL query received from slave.");
    return 0;
}
/**
 * Reply to a slave command by replaying a response that was saved when
 * MaxScale itself registered as a slave with the master. The saved buffer
 * is cloned and the clone is written to the slave's DCB.
 *
 * @param router    The binlog router instance
 * @param slave     The slave server to which we are sending the response
 * @param master    The saved master response, may be NULL
 * @return          Non-zero if data was sent, 0 if there was no saved
 *                  response or it could not be cloned
 */
static int
blr_slave_replay(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *master)
{
    GWBUF *copy;

    /* Nothing was saved for this command */
    if (master == NULL)
        return 0;

    copy = gwbuf_clone(master);
    if (copy == NULL)
    {
        LOGIF(LE, (skygw_log_write(LOGFILE_ERROR,
            "Failed to clone server response to send to slave.\n")));
        return 0;
    }

    return slave->dcb->func.write(slave->dcb, copy);
}
/**
 * Construct an error response and send it to the slave.
 *
 * The packet is laid out as:
 *      3 bytes   payload length
 *      1 byte    sequence id
 *      1 byte    0xff error indicator
 *      2 bytes   error code (always 0 here)
 *      6 bytes   "#00000" SQL state marker and state
 *      n bytes   the error message, not NUL terminated
 *
 * The payload is therefore strlen(msg) + 9 bytes and the whole packet is
 * strlen(msg) + 13 bytes. Nothing is sent if the buffer cannot be allocated.
 *
 * @param router    The router instance
 * @param slave     The slave server instance
 * @param msg       The error message to send
 */
static void
blr_slave_send_error(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, char *msg)
{
    GWBUF           *pkt;
    unsigned char   *data;
    size_t          msglen = strlen(msg);
    int             len;

    if ((pkt = gwbuf_alloc(msglen + 13)) == NULL)
        return;
    data = GWBUF_DATA(pkt);

    /* Everything after the 4 byte header: indicator, code, SQL state, message */
    len = msglen + 9;
    encode_value(&data[0], len, 24);        // Payload length
    data[3] = 1;                            // Sequence id, reply follows request 0
                                            // Payload
    data[4] = 0xff;                         // Error indicator
    data[5] = 0;                            // Error Code
    data[6] = 0;                            // Error Code
    memcpy(&data[7], "#00000", 6);          // SQL state marker and state
    memcpy(&data[13], msg, msglen);         // Error Message
    slave->dcb->func.write(slave->dcb, pkt);
}
/*
 * Some standard packets that have been captured from a network trace of server
 * interactions. These packets are the schema definition sent in response to
 * a SELECT UNIX_TIMESTAMP() statement and the EOF packet that marks the end
 * of transmission of the result set.
 *
 * timestamp_def holds three complete packets, each with a 3 byte little
 * endian payload length followed by a sequence number:
 *      sequence 1, 1 byte payload     0x01
 *      sequence 2, 0x26 byte payload  contains "def" and the column name
 *                                     "UNIX_TIMESTAMP()"
 *      sequence 3, 5 byte payload     starts with the 0xfe marker
 *
 * timestamp_eof is a single packet with sequence number 5 and a 5 byte
 * payload that starts with the 0xfe marker. Sequence number 4 is not in
 * either table.
 */
static uint8_t timestamp_def[] = {
0x01, 0x00, 0x00, 0x01, 0x01, 0x26, 0x00, 0x00, 0x02, 0x03, 0x64, 0x65, 0x66, 0x00, 0x00, 0x00,
0x10, 0x55, 0x4e, 0x49, 0x58, 0x5f, 0x54, 0x49, 0x4d, 0x45, 0x53, 0x54, 0x41, 0x4d, 0x50, 0x28,
0x29, 0x00, 0x0c, 0x3f, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x08, 0x81, 0x00, 0x00, 0x00, 0x00, 0x05,
0x00, 0x00, 0x03, 0xfe, 0x00, 0x00, 0x02, 0x00
};
static uint8_t timestamp_eof[] = { 0x05, 0x00, 0x00, 0x05, 0xfe, 0x00, 0x00, 0x02, 0x00 };
/**
 * Send a response to a "SELECT UNIX_TIMESTAMP()" request. This differs from the
 * other requests since we do not save a copy of the original interaction with
 * the master and simply replay it; we always want to send the current time.
 *
 * The response is built from three parts:
 *      - timestamp_def, the stored schema packets normally returned
 *      - a row packet (sequence number 4) holding the current time as a
 *        length prefixed decimal string
 *      - timestamp_eof, the prepared EOF packet that ends the result set
 *
 * @param router    The binlog router instance
 * @param slave     The slave server to which we are sending the response
 * @return          Non-zero if data was sent
 */
static int
blr_slave_send_timestamp(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave)
{
    GWBUF   *pkt;
    char    timestamp[24];
    uint8_t *ptr;
    int     len, ts_len;

    /*
     * time_t is not guaranteed to be a long, so convert explicitly, and
     * bound the write so the buffer can never be overrun.
     */
    snprintf(timestamp, sizeof(timestamp), "%ld", (long)time(0));
    ts_len = strlen(timestamp);

    /* 4 byte row packet header + 1 byte string length + the digits */
    len = sizeof(timestamp_def) + sizeof(timestamp_eof) + 5 + ts_len;
    if ((pkt = gwbuf_alloc(len)) == NULL)
        return 0;
    ptr = GWBUF_DATA(pkt);
    memcpy(ptr, timestamp_def, sizeof(timestamp_def));  // Fixed preamble
    ptr += sizeof(timestamp_def);
    encode_value(ptr, ts_len + 1, 24);                  // Add length of data packet
    ptr += 3;
    *ptr++ = 0x04;                                      // Sequence number in response
    *ptr++ = ts_len;                                    // Length of result string
    memcpy(ptr, timestamp, ts_len);                     // Result string, no terminator
    ptr += ts_len;
    memcpy(ptr, timestamp_eof, sizeof(timestamp_eof));  // EOF packet to terminate result
    return slave->dcb->func.write(slave->dcb, pkt);
}
/**
 * Process a slave replication registration message.
 *
 * The packet carries, after the command byte: a 4 byte server id, three
 * length prefixed strings (hostname, user, password), a 2 byte port and a
 * 4 byte rank. These are stored in the slave structure; an empty string is
 * stored as NULL. A response is then sent and the slave moves to the
 * BLRS_REGISTERED state.
 *
 * The response has a 7 byte payload with sequence number 1: three zero
 * bytes followed by the 4 byte server id of the slave.
 *
 * @param router    The router instance
 * @param slave     The slave server
 * @param queue     The COM_REGISTER_SLAVE packet
 * @return          Non-zero if data was sent
 */
static int
blr_slave_register(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue)
{
    GWBUF   *resp;
    uint8_t *ptr;
    int     slen;

    ptr = GWBUF_DATA(queue);
    ptr += 4;               // Skip length and sequence number
    if (*ptr++ != COM_REGISTER_SLAVE)
        return 0;

    slave->serverid = extract_field(ptr, 32);
    ptr += 4;

    /* Hostname */
    slen = *ptr++;
    if (slen != 0)
    {
        slave->hostname = strndup((char *)ptr, slen);
        ptr += slen;
    }
    else
        slave->hostname = NULL;

    /* User: copy first, then step over the string */
    slen = *ptr++;
    if (slen != 0)
    {
        slave->user = strndup((char *)ptr, slen);
        ptr += slen;
    }
    else
        slave->user = NULL;

    /* Password */
    slen = *ptr++;
    if (slen != 0)
    {
        slave->passwd = strndup((char *)ptr, slen);
        ptr += slen;
    }
    else
        slave->passwd = NULL;

    slave->port = extract_field(ptr, 16);
    ptr += 2;
    slave->rank = extract_field(ptr, 32);

    /*
     * Now construct a response
     */
    if ((resp = gwbuf_alloc(11)) == NULL)
        return 0;
    ptr = GWBUF_DATA(resp);
    encode_value(ptr, 7, 24);       // Payload length
    ptr += 3;
    *ptr++ = 1;                     // Sequence number
    encode_value(ptr, 0, 24);       // Three zero bytes
    ptr += 3;
    encode_value(ptr, slave->serverid, 32);
    slave->state = BLRS_REGISTERED;
    return slave->dcb->func.write(slave->dcb, resp);
}
/**
 * Process a COM_BINLOG_DUMP message from the slave. This is the
 * final step in the process of registration. The new master, MaxScale,
 * must send a response packet and generate a fake BINLOG_ROTATE event
 * with the binlog file requested by the slave, and then send a
 * FORMAT_DESCRIPTION_EVENT that has been saved from the real master.
 *
 * Once sent MaxScale must continue to send binlog events to the slave.
 * The high and low water marks and the DCB callbacks that drive this are
 * set up here, and blr_slave_catchup() is called straight away if the
 * slave is not at the position the router has reached.
 *
 * The request payload is: 1 byte command, 4 byte binlog position, 2 byte
 * flags, 4 byte server id and then the binlog file name, which fills the
 * rest of the packet and is not NUL terminated.
 *
 * @param router    The router instance
 * @param slave     The slave server
 * @param queue     The BINLOG_DUMP packet
 * @return          The value returned by the last write to the slave or by
 *                  blr_slave_catchup(), 0 if the packet is not a
 *                  COM_BINLOG_DUMP or a buffer could not be allocated
 */
static int
blr_slave_binlog_dump(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave, GWBUF *queue)
{
    GWBUF       *resp;
    uint8_t     *ptr;
    int         len, namelen, rval;
    REP_HEADER  hdr;
    uint32_t    chksum;

    ptr = GWBUF_DATA(queue);
    len = extract_field(ptr, 24);
    ptr += 4;               // Skip length and sequence number
    if (*ptr++ != COM_BINLOG_DUMP)
    {
        LOGIF(LE, (skygw_log_write(
            LOGFILE_ERROR,
            "blr_slave_binlog_dump expected a COM_BINLOG_DUMP but received %d\n",
            *(ptr-1))));
        return 0;
    }

    slave->binlog_pos = extract_field(ptr, 32);
    ptr += 4;
    ptr += 2;               // Flags, not used
    ptr += 4;               // Server id, not used

    /*
     * The file name is whatever remains of the payload after the 11 bytes
     * consumed above. Never read beyond that, never copy more than the
     * name buffer can hold, and leave the buffer zero padded and NUL
     * terminated since it is later copied and compared as a string.
     */
    namelen = len - 11;
    if (namelen < 0)
        namelen = 0;
    if (namelen > BINLOG_FNAMELEN)
        namelen = BINLOG_FNAMELEN;
    memset(slave->binlogfile, 0, BINLOG_FNAMELEN + 1);
    strncpy(slave->binlogfile, (char *)ptr, namelen);

    slave->state = BLRS_DUMPING;
    slave->seqno = 1;

    /* The rotate event is 4 bytes longer when it carries a CRC */
    if (slave->nocrc)
        len = 0x2b;
    else
        len = 0x2f;

    // Build a fake rotate event
    if ((resp = gwbuf_alloc(len + 5)) == NULL)
    {
        LOGIF(LE, (skygw_log_write(
            LOGFILE_ERROR,
            "blr_slave_binlog_dump failed to allocate the rotate event.\n")));
        return 0;
    }
    hdr.payload_len = len + 1;
    hdr.seqno = slave->seqno++;
    hdr.ok = 0;
    hdr.timestamp = 0L;
    hdr.event_type = ROTATE_EVENT;
    hdr.serverid = router->masterid;
    hdr.event_size = len;
    hdr.next_pos = 0;
    hdr.flags = 0x20;
    ptr = blr_build_header(resp, &hdr);
    encode_value(ptr, slave->binlog_pos, 64);
    ptr += 8;
    memcpy(ptr, slave->binlogfile, BINLOG_FNAMELEN);
    ptr += BINLOG_FNAMELEN;

    if (!slave->nocrc)
    {
        /*
         * Now add the CRC to the fake binlog rotate event.
         *
         * The algorithm is first to compute the checksum of an empty buffer
         * and then the checksum of the event portion of the message, ie we do not
         * include the length, sequence number and ok byte that makes up the first
         * 5 bytes of the message. We also do not include the 4 byte checksum itself.
         */
        chksum = crc32(0L, NULL, 0);
        chksum = crc32(chksum, GWBUF_DATA(resp) + 5, hdr.event_size - 4);
        encode_value(ptr, chksum, 32);
    }

    rval = slave->dcb->func.write(slave->dcb, resp);

    /* Send the FORMAT_DESCRIPTION_EVENT */
    if (router->saved_master.fde_event)
    {
        if ((resp = gwbuf_alloc(router->saved_master.fde_len + 5)) == NULL)
        {
            LOGIF(LE, (skygw_log_write(
                LOGFILE_ERROR,
                "blr_slave_binlog_dump failed to allocate the format description event.\n")));
            return 0;
        }
        ptr = GWBUF_DATA(resp);
        encode_value(ptr, router->saved_master.fde_len + 1, 24);    // Payload length
        ptr += 3;
        *ptr++ = slave->seqno++;
        *ptr++ = 0;         // OK
        memcpy(ptr, router->saved_master.fde_event, router->saved_master.fde_len);
        encode_value(ptr, time(0), 32);     // Overwrite timestamp
        /*
         * Since we have changed the timestamp we must recalculate the CRC
         *
         * Position ptr to the last 4 bytes of the event,
         * calculate a new checksum over the rest of the event
         * and write it there.
         *
         * NOTE(review): this is done even when slave->nocrc is set -
         * confirm the saved event always ends in a checksum.
         */
        ptr = GWBUF_DATA(resp) + 5 + router->saved_master.fde_len - 4;
        chksum = crc32(0L, NULL, 0);
        chksum = crc32(chksum, GWBUF_DATA(resp) + 5, router->saved_master.fde_len - 4);
        encode_value(ptr, chksum, 32);
        rval = slave->dcb->func.write(slave->dcb, resp);
    }

    /* Flow control for the stream of events that follows */
    slave->dcb->low_water = router->low_water;
    slave->dcb->high_water = router->high_water;
    dcb_add_callback(slave->dcb, DCB_REASON_LOW_WATER, blr_slave_callback, slave);
    dcb_add_callback(slave->dcb, DCB_REASON_DRAINED, blr_slave_callback, slave);

    if (slave->binlog_pos != router->binlog_position ||
            strcmp(slave->binlogfile, router->binlog_name) != 0)
    {
        /* The slave is behind the router, start streaming events to it */
        spinlock_acquire(&slave->catch_lock);
        slave->cstate &= ~CS_UPTODATE;
        spinlock_release(&slave->catch_lock);
        rval = blr_slave_catchup(router, slave);
    }

    return rval;
}
/**
 * Extract a little endian numeric field from a packet of the specified
 * number of bits, the number of bits must be a multiple of 8.
 *
 * The result is 32 bits wide; if more than 32 bits are requested only the
 * first four bytes contribute to the value.
 *
 * @param src   The raw packet source
 * @param bits  The number of bits to extract (multiple of 8)
 * @return      The extracted value
 */
static uint32_t
extract_field(uint8_t *src, int bits)
{
    uint32_t rval = 0, shift = 0;

    while (bits > 0 && shift < 32)
    {
        /*
         * Widen before shifting: a uint8_t promotes to a signed int and
         * shifting a value of 0x80 or more left by 24 would overflow it.
         */
        rval |= ((uint32_t)*src++) << shift;
        shift += 8;
        bits -= 8;
    }
    return rval;
}
/**
 * Write a value into a MySQL packet, least significant byte first.
 *
 * One byte is written for every 8 bits (or part thereof) requested. Bytes
 * beyond the width of the value are written as zero.
 *
 * @param data  Pointer to location in target packet
 * @param value The value to encode into the buffer
 * @param len   Number of bits to encode value into
 */
static void
encode_value(unsigned char *data, unsigned int value, int len)
{
    int remaining;
    int idx = 0;

    for (remaining = len; remaining > 0; remaining -= 8)
    {
        data[idx++] = (unsigned char)(value & 0xff);
        value = value >> 8;
    }
}
/**
 * Write a replication message header into the start of a GWBUF from the
 * values held in a header structure.
 *
 * The header occupies the first 24 bytes of the buffer:
 *      offset  0   3 bytes   payload length
 *      offset  3   1 byte    sequence number
 *      offset  4   1 byte    OK byte
 *      offset  5   4 bytes   timestamp
 *      offset  9   1 byte    event type
 *      offset 10   4 bytes   server id
 *      offset 14   4 bytes   event size
 *      offset 18   4 bytes   next position
 *      offset 22   2 bytes   flags
 *
 * @param pkt   The packet buffer to write the header into
 * @param hdr   The header values to write
 * @return      A pointer to the first byte following the event header
 */
static uint8_t *
blr_build_header(GWBUF *pkt, REP_HEADER *hdr)
{
    uint8_t *base = GWBUF_DATA(pkt);

    encode_value(base, hdr->payload_len, 24);
    base[3] = hdr->seqno;
    base[4] = hdr->ok;
    encode_value(base + 5, hdr->timestamp, 32);
    base[9] = hdr->event_type;
    encode_value(base + 10, hdr->serverid, 32);
    encode_value(base + 14, hdr->event_size, 32);
    encode_value(base + 18, hdr->next_pos, 32);
    encode_value(base + 22, hdr->flags, 16);

    return base + 24;
}
/**
* We have a registered slave that is behind the current leading edge of the
* binlog. We must replay the log entries to bring this node up to speed.
*
* There may be a large number of records to send to the slave, the process
* is triggered by the slave COM_BINLOG_DUMP message and all the events must
* be sent without receiving any new event. This means there is no trigger into
* MaxScale other than this initial message. However, if we simply send all the
* events we end up with an extremely long write queue on the DCB and risk running
* the server out of resources.
*
* To resolve this the concept of high and low water marks within the DCB has been
* added, with the ability for the DCB code to call user defined callbacks when the
* write queue is completely drained, when it crosses above the high water mark and
* when it crosses below the low water mark.
*
* The blr_slave_catchup routine will send binlog events to the slave until the high
* water mark is reached, at which point it will return. Later, when a low water mark
* callback is generated by the code that drains the DCB of data the blr_slave_catchup
* routine will again be called to write more events. The process is repeated until
* the slave has caught up with the master.
*
* Note: an additional check that the DCB is still above the low water mark is done
* prior to the return from this function to allow for any delays due to the call to
* the close system call, since this may cause thread rescheduling.
*
* @param router The binlog router
* @param slave The slave that is behind
* @return 1 if the slave is already up to date, if another thread is
* already in the inner loop or if no event was written; 0 if the
* binlog file could not be opened on entry to the outer loop;
* otherwise the value returned by the last write to the slave
*/
int
blr_slave_catchup(ROUTER_INSTANCE *router, ROUTER_SLAVE *slave)
{
GWBUF *head, *record;
REP_HEADER hdr;
int written, fd, rval = 1, burst = 0;
uint8_t *ptr;
struct timespec req;
// We are running now, so a callback is no longer being waited for
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_EXPECTCB;
spinlock_release(&slave->catch_lock);
doitagain:
/*
* We have a slightly complex synchronisation mechanism here,
* we need to make sure that we do not have multiple threads
* running the catchup loop, but we need to be very careful
* that we do not lose a call that is coming via a callback
* call as this will stall the binlog catchup process.
*
* We don't want to simply use a traditional mutex here for
* the loop, since this would block a MaxScale thread for
* an unacceptable length of time.
*
* We have two status bits, the CS_READING that says we are
* in the outer loop and the CS_INNERLOOP, to say we are in
* the inner loop.
*
* If just CS_READING is set the other thread may be about to
* enter the inner loop or may be about to exit the function
* completely. Therefore we have to wait to see if CS_READING
* is cleared or CS_INNERLOOP is set.
*
* If CS_READING gets cleared then this thread should proceed
* into the loop.
*
* If CS_INNERLOOP gets set then this thread does not need to
* proceed.
*
* If CS_READING is not set then this thread simply enters the
* loop.
*/
// Sleep interval used while waiting on the other thread: 1000 nanoseconds
req.tv_sec = 0;
req.tv_nsec = 1000;
spinlock_acquire(&slave->catch_lock);
if (slave->cstate & CS_UPTODATE)
{
LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE,
"blr_slave_catchup called with up to date slave %d at "
"%s@%d. Reading position %s@%d\n",
slave->serverid, slave->binlogfile,
slave->binlog_pos, router->binlog_name,
router->binlog_position)));
slave->stats.n_alreadyupd++;
spinlock_release(&slave->catch_lock);
return 1;
}
while (slave->cstate & CS_READING)
{
// Wait until we know what the other thread is doing
while ((slave->cstate & (CS_READING|CS_INNERLOOP)) == CS_READING)
{
spinlock_release(&slave->catch_lock);
nanosleep(&req, NULL);
spinlock_acquire(&slave->catch_lock);
}
// Other thread is in the innerloop
if ((slave->cstate & (CS_READING|CS_INNERLOOP)) == (CS_READING|CS_INNERLOOP))
{
spinlock_release(&slave->catch_lock);
LOGIF(LM, (skygw_log_write(
LOGFILE_MESSAGE,
"blr_slave_catchup thread returning due to "
"lock being held by another thread. %s@%d\n",
slave->binlogfile,
slave->binlog_pos)));
// NOTE(review): this counter is updated after catch_lock was released
slave->stats.n_catchupnr++;
return 1; // We cheat here and return 1 because otherwise
// an error would be sent and we do not want that
}
/* Release the lock for a short time to allow the other
* thread to exit the outer reading loop.
*/
spinlock_release(&slave->catch_lock);
nanosleep(&req, NULL);
spinlock_acquire(&slave->catch_lock);
}
// This thread now owns the catchup loop; catch_lock is still held here
if (slave->pthread)
LOGIF(LD, (skygw_log_write(LOGFILE_DEBUG, "Multiple threads sending to same thread.\n")));
slave->pthread = pthread_self();
slave->cstate |= CS_READING;
spinlock_release(&slave->catch_lock);
if (DCB_ABOVE_HIGH_WATER(slave->dcb))
LOGIF(LT, (skygw_log_write(LOGFILE_TRACE, "blr_slave_catchup above high water on entry.\n")));
// Outer loop: one iteration per burst of events read from the binlog file
do {
if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
{
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_READING;
spinlock_release(&slave->catch_lock);
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s\n",
slave->binlogfile)));
return 0;
}
atomic_add(&slave->stats.n_bursts, 1);
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_INNERLOOP;
spinlock_release(&slave->catch_lock);
// Inner loop: send events until the DCB goes above high water or
// there is nothing more to read.
// NOTE(review): if the DCB is already above high water the first time
// this condition is evaluated, record is never assigned and is read
// uninitialised in the tests that follow the loop - confirm and fix.
while ((!DCB_ABOVE_HIGH_WATER(slave->dcb)) &&
(record = blr_read_binlog(fd, slave->binlog_pos, &hdr)) != NULL)
{
if (hdr.event_size > DEF_HIGH_WATER) slave->stats.n_above++;
// Build the 5 byte prefix (length, sequence number, OK byte) and
// chain the event read from the file onto it.
// NOTE(review): the result of gwbuf_alloc is not checked before use
head = gwbuf_alloc(5);
ptr = GWBUF_DATA(head);
encode_value(ptr, hdr.event_size + 1, 24);
ptr += 3;
*ptr++ = slave->seqno++;
*ptr++ = 0; // OK
head = gwbuf_append(head, record);
if (hdr.event_type == ROTATE_EVENT)
{
// Switch to the file named in the rotate event before sending it on
close(fd);
blr_slave_rotate(slave, GWBUF_DATA(record));
if ((fd = blr_open_binlog(router, slave->binlogfile)) == -1)
{
LOGIF(LE, (skygw_log_write(
LOGFILE_ERROR,
"blr_slave_catchup failed to open binlog file %s\n",
slave->binlogfile)));
// NOTE(review): head is neither written nor freed on this path and
// fd is -1 when close(fd) is called after the loop - confirm intent
break;
}
}
written = slave->dcb->func.write(slave->dcb, head);
// A rotate event has already set the position via blr_slave_rotate
if (written && hdr.event_type != ROTATE_EVENT)
{
slave->binlog_pos = hdr.next_pos;
}
rval = written;
atomic_add(&slave->stats.n_events, 1);
burst++;
}
if (record == NULL)
slave->stats.n_failed_read++;
spinlock_acquire(&slave->catch_lock);
slave->cstate &= ~CS_INNERLOOP;
spinlock_release(&slave->catch_lock);
close(fd);
} while (record && DCB_BELOW_LOW_WATER(slave->dcb));
if (record)
{
// There was still data to read, so we stopped because of flow control;
// record that a callback is expected to restart the process
atomic_add(&slave->stats.n_flows, 1);
spinlock_acquire(&slave->catch_lock);
slave->cstate |= CS_EXPECTCB;
spinlock_release(&slave->catch_lock);
}
else
{
// Nothing more could be read, mark the slave as up to date
int state_change = 0;
spinlock_acquire(&slave->catch_lock);
if ((slave->cstate & CS_UPTODATE) == 0)
{
atomic_add(&slave->stats.n_upd, 1);
slave->cstate |= CS_UPTODATE;
state_change = 1;
}
spinlock_release(&slave->catch_lock);
if (state_change)
LOGIF(LM, (skygw_log_write(LOGFILE_MESSAGE,
"blr_slave_catchup slave is up to date %s, %u\n",
slave->binlogfile, slave->binlog_pos)));
}
// Give up ownership of the catchup loop
spinlock_acquire(&slave->catch_lock);
#if 0
if (slave->pthread != pthread_self())
{
LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Multple threads in catchup for same slave: %x and %x\n", slave->pthread, pthread_self())));
abort();
}
#endif
slave->pthread = 0;
#if 0
if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position) abort();
#endif
slave->cstate &= ~CS_READING;
spinlock_release(&slave->catch_lock);
// The queue may have drained while we were finishing up; if it has and the
// slave is not at the router's position, go round again rather than rely
// on a low water callback
if (DCB_BELOW_LOW_WATER(slave->dcb) && slave->binlog_pos != router->binlog_position)
{
LOGIF(LE, (skygw_log_write(LOGFILE_ERROR, "Expected to be above low water\n")));
goto doitagain;
}
return rval;
}
/**
 * DCB callback registered for a slave connection. It restarts the catchup
 * process when the write queue of the DCB has emptied enough to accept more
 * binlog events, which is how flow control is implemented when large
 * quantities of binlog records are sent during catchup.
 *
 * On DCB_REASON_DRAINED the catchup is restarted only if the slave is in
 * the dumping state and is not at the router's binlog position. On
 * DCB_REASON_LOW_WATER it is restarted whenever the slave is in the dumping
 * state; otherwise the callback is just counted.
 *
 * @param dcb       The DCB of the slave connection
 * @param reason    The reason the callback was called
 * @param data      The user data, in this case the slave structure
 * @return          Always 0
 */
static int
blr_slave_callback(DCB *dcb, DCB_REASON reason, void *data)
{
    ROUTER_SLAVE    *slave = (ROUTER_SLAVE *)data;
    ROUTER_INSTANCE *router = slave->router;

    if (reason == DCB_REASON_DRAINED
        && slave->state == BLRS_DUMPING
        && slave->binlog_pos != router->binlog_position)
    {
        atomic_add(&slave->stats.n_dcb, 1);
        blr_slave_catchup(router, slave);
    }

    if (reason == DCB_REASON_LOW_WATER)
    {
        if (slave->state != BLRS_DUMPING)
        {
            /* Callback arrived while the slave was not being sent events */
            atomic_add(&slave->stats.n_cbna, 1);
        }
        else
        {
            atomic_add(&slave->stats.n_cb, 1);
            blr_slave_catchup(router, slave);
        }
    }

    return 0;
}
/**
 * Rotate the slave to the new binlog file.
 *
 * After the 19 byte event header the rotate event holds an 8 byte little
 * endian position followed by the name of the new binlog file. Both are
 * copied into the slave structure.
 *
 * @param slave The slave instance
 * @param ptr   The rotate event (minus header and OK byte)
 */
void
blr_slave_rotate(ROUTER_SLAVE *slave, uint8_t *ptr)
{
    uint64_t pos;

    ptr += 19;      // Skip header

    /*
     * Combine the two halves in a 64 bit value: shifting the 32 bit
     * result of extract_field() left by 32 is undefined behaviour.
     */
    pos = extract_field(ptr, 32);
    pos |= ((uint64_t)extract_field(ptr + 4, 32)) << 32;
    slave->binlog_pos = pos;

    /* NOTE(review): assumes the name is BINLOG_FNAMELEN bytes long - confirm */
    memcpy(slave->binlogfile, ptr + 8, BINLOG_FNAMELEN);
    slave->binlogfile[BINLOG_FNAMELEN] = 0;
}

View File

@ -160,6 +160,10 @@ struct subcommand showoptions[] = {
"Show all active sessions in MaxScale",
"Show all active sessions in MaxScale",
{0, 0, 0} },
{ "threads", 0, dShowThreads,
"Show the status of the polling threads in MaxScale",
"Show the status of the polling threads in MaxScale",
{0, 0, 0} },
{ "users", 0, telnetdShowUsers,
"Show statistics and user names for the debug interface",
"Show statistics and user names for the debug interface",
@ -208,6 +212,10 @@ struct subcommand listoptions[] = {
"List all the active sessions within MaxScale",
"List all the active sessions within MaxScale",
{0, 0, 0} },
{ "threads", 0, dShowThreads,
"List the status of the polling threads in MaxScale",
"List the status of the polling threads in MaxScale",
{0, 0, 0} },
{ NULL, 0, NULL, NULL, NULL,
{0, 0, 0} }
};