Improved diagnostics

Added master reconnect on failure

Added EPOLLRDHUP events
This commit is contained in:
Mark Riddoch 2014-06-07 21:01:11 +01:00
parent bb0e6c3858
commit 13e95ffc53
7 changed files with 180 additions and 32 deletions

View File

@ -602,6 +602,15 @@ config_threadcount()
return gateway.n_threads;
}
static struct {
char *logname;
logfile_id_t logfile;
} lognames[] = {
{ "log_messages", LOGFILE_MESSAGE },
{ "log_trace", LOGFILE_TRACE },
{ "log_debug", LOGFILE_DEBUG },
{ NULL, 0 }
};
/**
* Configuration handler for items in the global [MaxScale] section
*
@ -612,10 +621,20 @@ config_threadcount()
static int
handle_global_item(const char *name, const char *value)
{
int i;
if (strcmp(name, "threads") == 0) {
gateway.n_threads = atoi(value);
} else {
return 0;
for (i = 0; lognames[i].logname; i++)
{
if (strcasecmp(name, lognames[i].logname) == 0)
{
if (atoi(value))
skygw_log_enable(lognames[i].logfile);
else
skygw_log_disable(lognames[i].logfile);
}
}
}
return 1;
}

View File

@ -1131,6 +1131,7 @@ dprintDCB(DCB *pdcb, DCB *dcb)
dcb_printf(pdcb, "\tConnected to: %s\n", dcb->remote);
dcb_printf(pdcb, "\tOwning Session: %d\n", dcb->session);
dcb_printf(pdcb, "\tQueued write data: %d\n", gwbuf_length(dcb->writeq));
dcb_printf(pdcb, "\tDelayed write data: %d\n", gwbuf_length(dcb->delayq));
dcb_printf(pdcb, "\tStatistics:\n");
dcb_printf(pdcb, "\t\tNo. of Reads: %d\n", dcb->stats.n_reads);
dcb_printf(pdcb, "\t\tNo. of Writes: %d\n", dcb->stats.n_writes);

View File

@ -99,7 +99,7 @@ poll_add_dcb(DCB *dcb)
CHK_DCB(dcb);
ev.events = EPOLLIN | EPOLLOUT | EPOLLET;
ev.events = EPOLLIN | EPOLLOUT | EPOLLRDHUP | EPOLLET;
ev.data.ptr = dcb;
/*<
@ -474,6 +474,25 @@ poll_waitevents(void *arg)
atomic_add(&pollStats.n_hup, 1);
dcb->func.hangup(dcb);
}
if (ev & EPOLLRDHUP)
{
int eno = 0;
eno = gw_getsockerrno(dcb->fd);
LOGIF(LD, (skygw_log_write(
LOGFILE_DEBUG,
"%lu [poll_waitevents] "
"EPOLLRDHUP on dcb %p, fd %d. "
"Errno %d, %s.",
pthread_self(),
dcb,
dcb->fd,
eno,
strerror(eno))));
atomic_add(&pollStats.n_hup, 1);
dcb->func.hangup(dcb);
}
} /*< for */
no_op = FALSE;
}

View File

@ -110,6 +110,11 @@ typedef struct {
uint64_t n_cachehits; /*< Number of hits on the binlog cache */
uint64_t n_cachemisses; /*< Number of misses on the binlog cache */
unsigned int n_registered; /*< Number of registered slaves */
int n_masterstarts; /*< Numebr of times connection restarted */
int n_queueadd; /*< Numebr of times incoming data was added to processign queue */
int n_residuals; /*< Number of times residual data was buffered */
unsigned int n_heartbeats; /*< Number of heartbeat messages */
time_t lastReply;
uint64_t n_fakeevents; /*< Fake events not written to disk */
uint64_t events[0x24]; /*< Per event counters */
} ROUTER_STATS;
@ -175,8 +180,10 @@ typedef struct router_instance {
char *password; /*< Password to use with master */
char *fileroot; /*< Root of binlog filename */
DCB *master; /*< DCB for master connection */
DCB *client; /*< DCB for dummy client */
SESSION *session; /*< Fake session for master connection */
unsigned int master_state; /*< State of the master FSM */
uint8_t lastEventReceived;
GWBUF *residual; /*< Any residual binlog event */
MASTER_RESPONSES saved_master; /*< Saved master responses */
char binlog_name[BINLOG_FNAMELEN+1];

View File

@ -792,6 +792,31 @@ return_fd:
static int
gw_backend_hangup(DCB *dcb)
{
SESSION *session;
void *rsession;
ROUTER_OBJECT *router;
ROUTER *router_instance;
int rc = 0;
session = dcb->session;
if (session->state == SESSION_STATE_ROUTER_READY)
{
router = session->service->router;
router_instance = session->service->router_instance;
rsession = session->router_session;
/*<
* rsession should never be NULL here.
*/
LOGIF(LD, (skygw_log_write_flush(
LOGFILE_DEBUG,
"%lu [gw_backend_hangup] "
"Call closeSession for backend "
"session.",
pthread_self())));
router->closeSession(router_instance, rsession);
}
/*< vraa : errorHandle */
return 1;
}

View File

@ -39,6 +39,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <service.h>
#include <server.h>
#include <router.h>
@ -47,6 +48,7 @@
#include <blr.h>
#include <dcb.h>
#include <spinlock.h>
#include <time.h>
#include <skygw_types.h>
#include <skygw_utils.h>
@ -436,8 +438,13 @@ ROUTER_SLAVE *slave = (ROUTER_SLAVE *)router_session;
*
* TODO: Handle closure of master session
*/
LOGIF(LD, (skygw_log_write_flush(
LOGIF(LE, (skygw_log_write_flush(
LOGFILE_ERROR, "Binlog router close session with master")));
router->master_state = BLRM_UNCONNECTED;
dcb_close(router->master);
dcb_free(router->master);
dcb_free(router->client);
blr_start_master(router);
return;
}
CHK_CLIENT_RSES(slave);
@ -504,8 +511,10 @@ static void
diagnostics(ROUTER *router, DCB *dcb)
{
ROUTER_INSTANCE *router_inst = (ROUTER_INSTANCE *)router;
ROUTER_SLAVE *session;
int i = 0;
ROUTER_SLAVE *session;
int i = 0;
char buf[40];
struct tm tm;
spinlock_acquire(&router_inst->lock);
session = router_inst->slaves;
@ -515,7 +524,17 @@ int i = 0;
session = session->next;
}
spinlock_release(&router_inst->lock);
dcb_printf(dcb, "\tMaster connection DCB: %p\n",
router_inst->master);
dcb_printf(dcb, "\tMaster connection state: %s\n",
blrm_states[router_inst->master_state]);
localtime_r(&router_inst->stats.lastReply, &tm);
asctime_r(&tm, buf);
dcb_printf(dcb, "\tNumber of master connects: %d\n",
router_inst->stats.n_masterstarts);
dcb_printf(dcb, "\tCurrent binlog file: %s\n",
router_inst->binlog_name);
dcb_printf(dcb, "\tCurrent binlog position: %u\n",
@ -524,7 +543,7 @@ int i = 0;
router_inst->stats.n_slaves);
dcb_printf(dcb, "\tNumber of binlog events received: %u\n",
router_inst->stats.n_binlogs);
dcb_printf(dcb, "\tNumber of fake binlog events: %u\n",
dcb_printf(dcb, "\tNumber of fake binlog events: %u\n",
router_inst->stats.n_fakeevents);
dcb_printf(dcb, "\tNumber of binlog events in error: %u\n",
router_inst->stats.n_binlog_errors);
@ -534,10 +553,23 @@ int i = 0;
router_inst->stats.n_cachehits);
dcb_printf(dcb, "\tNumber of binlog cache misses: %u\n",
router_inst->stats.n_cachemisses);
dcb_printf(dcb, "\tNumber of heartbeat events: %u\n",
router_inst->stats.n_heartbeats);
dcb_printf(dcb, "\tNumber of packets received: %u\n",
router_inst->stats.n_reads);
dcb_printf(dcb, "\tNumber of packets queued: %u\n",
router_inst->stats.n_queueadd);
dcb_printf(dcb, "\tCurrent length of incoming queue: %d\n",
gwbuf_length(router_inst->queue));
dcb_printf(dcb, "\tNumber of residual data packets: %u\n",
router_inst->stats.n_residuals);
dcb_printf(dcb, "\tAverage events per packet %.1f\n",
(double)router_inst->stats.n_binlogs / router_inst->stats.n_reads);
dcb_printf(dcb, "\tLast event from master at: %s\n", buf);
dcb_printf(dcb, "\tLast event from master: 0x%x\n",
router_inst->lastEventReceived);
if (router_inst->active_logs)
dcb_printf(dcb, "\tRouter processing binlog records\n");
dcb_printf(dcb, "\tEvents received:\n");
for (i = 0; i < 0x24; i++)
{
@ -596,6 +628,7 @@ ROUTER_INSTANCE *router = (ROUTER_INSTANCE *)instance;
atomic_add(&router->stats.n_reads, 1);
blr_master_response(router, queue);
router->stats.lastReply = time(0);
}
/**
@ -619,6 +652,8 @@ errorReply(
DCB *backend_dcb,
int action)
{
LOGIF(LE, (skygw_log_write_flush(
LOGFILE_ERROR, "Erorr Reply '%s'", message)));
}
/** to be inline'd */

View File

@ -48,6 +48,9 @@
#include <dcb.h>
#include <spinlock.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <skygw_types.h>
#include <skygw_utils.h>
#include <log_manager.h>
@ -68,6 +71,8 @@ static void *CreateMySQLAuthData(char *username, char *password, char *database)
static void blr_extract_header(uint8_t *pkt, REP_HEADER *hdr);
static uint32_t extract_field(uint8_t *src, int bits);
static int keepalive = 1;
/**
* blr_start_master - controls the connection of the binlog router to the
* master MySQL server and triggers the slave registration process for
@ -81,15 +86,37 @@ blr_start_master(ROUTER_INSTANCE *router)
DCB *client;
GWBUF *buf;
client = dcb_alloc(DCB_ROLE_INTERNAL);
if ((client = dcb_alloc(DCB_ROLE_INTERNAL)) == NULL)
{
LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR,
"Binlog router: failed to create DCB for dummy client\n")));
return;
}
router->client = client;
client->data = CreateMySQLAuthData(router->user, router->password, "");
router->session = session_alloc(router->service, client);
if ((router->session = session_alloc(router->service, client)) == NULL)
{
LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR,
"Binlog router: failed to create session for connection to master\n")));
return;
}
client->session = router->session;
router->master = dcb_connect(router->service->databases, router->session, BLR_PROTOCOL);
if ((router->master = dcb_connect(router->service->databases, router->session, BLR_PROTOCOL)) == NULL)
{
LOGIF(LE, (skygw_log_write_flush(LOGFILE_ERROR,
"Binlog router: failed to connect to master\n")));
return;
}
if (setsockopt(router->master->fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive , sizeof(keepalive )))
perror("setsockopt");
router->master_state = BLRM_AUTHENTICATED;
buf = blr_make_query("SELECT UNIX_TIMESTAMP()");
router->master->func.write(router->master, buf);
router->master_state = BLRM_TIMESTAMP;
router->stats.n_masterstarts++;
}
/**
@ -131,6 +158,7 @@ char query[128];
* to the point that it will not look at new packets
* added to the queue.
*/
router->stats.n_queueadd++;
router->queue = gwbuf_append(router->queue, buf);
spinlock_release(&router->lock);
return;
@ -305,16 +333,17 @@ int len = 18;
if ((buf = gwbuf_alloc(len + 4)) == NULL)
return NULL;
data = GWBUF_DATA(buf);
encode_value(&data[0], len, 24); // Payload length
data[3] = 0; // Sequence ID
data[4] = COM_REGISTER_SLAVE; // Command
encode_value(&data[5], router->serverid, 32); // Slave Server ID
data[9] = 0; // Slave hostname length
data[10] = 0; // Slave username length
data[11] = 0; // Slave password length
encode_value(&data[12], router->service->ports->port, 16); // Slave master port
encode_value(&data[14], 0, 32); // Replication rank
encode_value(&data[18], router->masterid, 32); // Master server-id
encode_value(&data[0], len, 24); // Payload length
data[3] = 0; // Sequence ID
data[4] = COM_REGISTER_SLAVE; // Command
encode_value(&data[5], router->serverid, 32); // Slave Server ID
data[9] = 0; // Slave hostname length
data[10] = 0; // Slave username length
data[11] = 0; // Slave password length
encode_value(&data[12],
router->service->ports->port, 16); // Slave master port
encode_value(&data[14], 0, 32); // Replication rank
encode_value(&data[18], router->masterid, 32); // Master server-id
return buf;
}
@ -338,14 +367,16 @@ int len = 0x1b;
return NULL;
data = GWBUF_DATA(buf);
encode_value(&data[0], len,24); // Payload length
data[3] = 0; // Sequence ID
data[4] = COM_BINLOG_DUMP; // Command
encode_value(&data[5], router->binlog_position, 32); // binlog position
encode_value(&data[9], 0, 16); // Flags
encode_value(&data[11], router->serverid, 32); // Server-id of MaxScale
encode_value(&data[0], len,24); // Payload length
data[3] = 0; // Sequence ID
data[4] = COM_BINLOG_DUMP; // Command
encode_value(&data[5],
router->binlog_position, 32); // binlog position
encode_value(&data[9], 0, 16); // Flags
encode_value(&data[11],
router->serverid, 32); // Server-id of MaxScale
strncpy((char *)&data[15], router->binlog_name,
BINLOG_FNAMELEN); // binlog filename
BINLOG_FNAMELEN); // binlog filename
return buf;
}
@ -469,9 +500,14 @@ int no_residual = 1;
/*
* The message is not fully contained in the current
* and we do not have the complete message in the
* buffer chain. Therefore we must stop processing until
* we receive the next buffer.
* buffer chain. Therefore we must stop processing
* until we receive the next buffer.
*/
router->stats.n_residuals++;
LOGIF(LD,(skygw_log_write(
LOGFILE_DEBUG,
"Residual data left after %d records.\n",
router->stats.n_binlogs)));
break;
}
else
@ -486,13 +522,19 @@ int no_residual = 1;
if (hdr.event_size != len - 5)
{
printf("Packet length is %d, but event size is %d\n",
len, hdr.event_size);
abort();
LOGIF(LE,(skygw_log_write(
LOGFILE_ERROR,
"Packet length is %d, but event size is %d, "
"binlog file %s position %d",
len, hdr.event_size,
router->binlog_name,
router->binlog_position)));
break;
}
if (hdr.ok == 0)
{
router->stats.n_binlogs++;
router->lastEventReceived = hdr.event_type;
// #define SHOW_EVENTS
#ifdef SHOW_EVENTS
@ -526,7 +568,7 @@ int no_residual = 1;
#ifdef SHOW_EVENTS
printf("Replication heartbeat\n");
#endif
;
router->stats.n_heartbeats++;
}
else if (hdr.flags != LOG_EVENT_ARTIFICIAL_F)
{