From c91187d4deebbb1e12700b8f9c4320ab3898e668 Mon Sep 17 00:00:00 2001 From: MassimilianoPinto Date: Tue, 29 Aug 2017 18:31:17 +0200 Subject: [PATCH] MXS-1156: added limit to master connect retry MXS-1156: added limit to master connect retry --- server/modules/routing/binlogrouter/blr.c | 4 +- server/modules/routing/binlogrouter/blr.h | 8 +- .../modules/routing/binlogrouter/blr_master.c | 147 ++++++++++++++---- .../modules/routing/binlogrouter/blr_slave.c | 4 +- 4 files changed, 128 insertions(+), 35 deletions(-) diff --git a/server/modules/routing/binlogrouter/blr.c b/server/modules/routing/binlogrouter/blr.c index 4ce459a77..8927561b3 100644 --- a/server/modules/routing/binlogrouter/blr.c +++ b/server/modules/routing/binlogrouter/blr.c @@ -326,7 +326,7 @@ createInstance(SERVICE *service, char **options) inst->user = MXS_STRDUP_A(service->credentials.name); inst->password = MXS_STRDUP_A(service->credentials.authdata); - inst->retry_backoff = 1; + inst->retry_backoff = 0; inst->m_errno = 0; inst->m_errmsg = NULL; @@ -1584,6 +1584,8 @@ diagnostics(MXS_ROUTER *router, DCB *dcb) router_inst->stats.n_masterstarts); dcb_printf(dcb, "\tNumber of delayed reconnects: %d\n", router_inst->stats.n_delayedreconnects); + dcb_printf(dcb, "\tNumber of connect retries: %d\n", + router_inst->retry_backoff); dcb_printf(dcb, "\tCurrent binlog file: %s\n", router_inst->binlog_name); dcb_printf(dcb, "\tCurrent binlog position: %lu\n", diff --git a/server/modules/routing/binlogrouter/blr.h b/server/modules/routing/binlogrouter/blr.h index 2020c8e53..33404cd04 100644 --- a/server/modules/routing/binlogrouter/blr.h +++ b/server/modules/routing/binlogrouter/blr.h @@ -238,10 +238,12 @@ typedef enum /** * master reconnect backoff constants * BLR_MASTER_BACKOFF_TIME The increments of the back off time (seconds) - * BLR_MAX_BACKOFF Maximum number of increments to backoff to + * BLR_MASTER_CONNECT_RETRY The connect retry interval + * BLR_BLR_MASTER_RETRY_COUNT Maximum value of retries */ -#define BLR_MASTER_BACKOFF_TIME 10 -#define BLR_MAX_BACKOFF 60 +#define BLR_MASTER_BACKOFF_TIME 10 +#define BLR_MASTER_CONNECT_RETRY 60 +#define BLR_MASTER_RETRY_COUNT 1000 /* max size for error message returned to client */ #define BINLOG_ERROR_MSG_LEN 700 diff --git a/server/modules/routing/binlogrouter/blr_master.c b/server/modules/routing/binlogrouter/blr_master.c index 3bd9404ed..8aa48d23a 100644 --- a/server/modules/routing/binlogrouter/blr_master.c +++ b/server/modules/routing/binlogrouter/blr_master.c @@ -144,10 +144,10 @@ extern int blr_write_special_event(ROUTER_INSTANCE *router, extern int blr_file_new_binlog(ROUTER_INSTANCE *router, char *file); static bool blr_handle_missing_files(ROUTER_INSTANCE *router, char *new_file); - static void worker_cb_start_master(int worker_id, void* data); extern bool blr_file_exists(ROUTER_INSTANCE *router); extern void blr_file_update_gtid(ROUTER_INSTANCE *router); +static int blr_check_connect_retry(ROUTER_INSTANCE *router); static int keepalive = 1; @@ -186,25 +186,53 @@ static void blr_start_master(void* data) router->stats.n_binlogs_ses = 0; spinlock_acquire(&router->lock); + if (router->master_state != BLRM_UNCONNECTED) { - if (router->master_state != BLRM_SLAVE_STOPPED) + if (router->master_state != BLRM_SLAVE_STOPPED && + router->master_state != BLRM_CONNECTING) { - MXS_ERROR("%s: Master Connect: Unexpected master state %s\n", + MXS_ERROR("%s: Master Connect: Unexpected master state [%s]\n", router->service->name, blrm_states[router->master_state]); } else { - MXS_NOTICE("%s: Master Connect: binlog state is %s\n", - router->service->name, - blrm_states[router->master_state]); + MXS_NOTICE("%s: Master Connect: binlog current state is [%s]\n", + router->service->name, + blrm_states[router->master_state]); } + + /* Return only if state is not BLRM_CONNECTING */ + if (router->master_state != BLRM_CONNECTING) + { + spinlock_release(&router->lock); + return; + } + } + + /* Check whether master connection can be started */ + int connect_retry; + if ((connect_retry = blr_check_connect_retry(router)) == 0) + { + /* Force stopped state */ + router->master_state = BLRM_SLAVE_STOPPED; spinlock_release(&router->lock); + + MXS_ERROR("%s: failure while connecting to master server '%s', " + "reached %d maximum number of retries. " + "Replication is stopped.", + router->service->name, + router->service->dbref->server->unique_name, + BLR_MASTER_RETRY_COUNT); return; } + + /* Force connecting state */ router->master_state = BLRM_CONNECTING; + /* Increment retry counter */ + router->retry_backoff++; spinlock_release(&router->lock); DCB* client = dcb_alloc(DCB_ROLE_INTERNAL, NULL); @@ -242,23 +270,24 @@ static void blr_start_master(void* data) router->session, BLR_PROTOCOL)) == NULL) { - char *name = MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1); - + /* Set reconnection task */ + static const char master[] = "Master"; + char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master)); if (name) { - sprintf(name, "%s Master", router->service->name); + sprintf(name, "%s %s", router->service->name, master); hktask_oneshot(name, blr_start_master_in_main, router, - BLR_MASTER_BACKOFF_TIME * router->retry_backoff++); + connect_retry); MXS_FREE(name); } - if (router->retry_backoff > BLR_MAX_BACKOFF) - { - router->retry_backoff = BLR_MAX_BACKOFF; - } - MXS_ERROR("failed to connect to master server '%s'", - router->service->dbref->server->unique_name); + + MXS_ERROR("%s: failure while connecting to master server '%s', " + "retrying in %d seconds", + router->service->name, + router->service->dbref->server->unique_name, + connect_retry); return; } router->master->remote = MXS_STRDUP_A(router->service->dbref->server->name); @@ -349,28 +378,55 @@ blr_restart_master(ROUTER_INSTANCE *router) spinlock_acquire(&router->lock); router->reconnect_pending = 0; router->active_logs = 0; - spinlock_release(&router->lock); + if (router->master_state < BLRM_BINLOGDUMP) { - router->master_state = BLRM_UNCONNECTED; + int connect_retry; + if ((connect_retry = blr_check_connect_retry(router)) == 0) + { + /* Force stopped state */ + router->master_state = BLRM_SLAVE_STOPPED; + spinlock_release(&router->lock); - char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1); + MXS_ERROR("%s: failed to connect to master server '%s', " + "reached %d maximum number of retries. " + "Replication is stopped.", + router->service->name, + router->service->dbref->server->unique_name, + BLR_MASTER_RETRY_COUNT); + return; + } + + /* Force unconnected state */ + router->master_state = BLRM_UNCONNECTED; + spinlock_release(&router->lock); + + /* Set reconnection task */ + static const char master[] = "Master"; + char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master)); if (name) { - sprintf(name, "%s Master", router->service->name); - hktask_oneshot(name, blr_start_master_in_main, router, - BLR_MASTER_BACKOFF_TIME * router->retry_backoff++); + sprintf(name, "%s %s", router->service->name, master); + hktask_oneshot(name, + blr_start_master_in_main, + router, + connect_retry); MXS_FREE(name); - } - if (router->retry_backoff > BLR_MAX_BACKOFF) - { - router->retry_backoff = BLR_MAX_BACKOFF; + + MXS_ERROR("%s: failed to connect to master server '%s', " + "retrying in %d seconds", + router->service->name, + router->service->dbref->server->unique_name, + connect_retry); } } else { + /* Force unconnected state */ router->master_state = BLRM_UNCONNECTED; + spinlock_release(&router->lock); + blr_start_master_in_main(router); } } @@ -445,12 +501,16 @@ blr_master_close(ROUTER_INSTANCE *router) void blr_master_delayed_connect(ROUTER_INSTANCE *router) { - char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master Recovery") + 1); + static const char master[] = "Master Recovery"; + char *name = (char*)MXS_MALLOC(strlen(router->service->name) + sizeof(master)) if (name) { - sprintf(name, "%s Master Recovery", router->service->name); - hktask_oneshot(name, blr_start_master_in_main, router, 60); + sprintf(name, "%s %s", router->service->name, master); + hktask_oneshot(name, + blr_start_master_in_main, + router, + BLR_MASTER_CONNECT_RETRY); MXS_FREE(name); } } @@ -2872,7 +2932,7 @@ static void blr_start_master_registration(ROUTER_INSTANCE *router, GWBUF *buf) blr_register_send_command(router, "SHOW VARIABLES LIKE 'SERVER_ID'", BLRM_SERVERID); - router->retry_backoff = 1; + router->retry_backoff = 0; break; case BLRM_SERVERID: // If set heartbeat is not being sent, next state is BLRM_HBPERIOD @@ -3463,3 +3523,30 @@ static bool blr_handle_missing_files(ROUTER_INSTANCE *router, // Did nothing, just return true return true; } + +/** + * Check the connection retry limit and increment + * by BLR_MASTER_BACKOFF_TIME up to router->retry_interval. + * + * @param router The current router instance + * @return The interval to use for next reconnect + * or 0 if router->retry_limit has been hit. + */ +static int blr_check_connect_retry(ROUTER_INSTANCE *router) +{ + /* Stop reconnection to master */ + if (router->retry_backoff >= BLR_MASTER_RETRY_COUNT) + { + return 0; + } + + /* Return the interval for next reconnect */ + if (router->retry_backoff >= BLR_MASTER_CONNECT_RETRY / BLR_MASTER_BACKOFF_TIME) + { + return BLR_MASTER_CONNECT_RETRY; + } + else + { + return BLR_MASTER_BACKOFF_TIME * (1 + router->retry_backoff); + } +} diff --git a/server/modules/routing/binlogrouter/blr_slave.c b/server/modules/routing/binlogrouter/blr_slave.c index 7df86d36c..32b5421d9 100644 --- a/server/modules/routing/binlogrouter/blr_slave.c +++ b/server/modules/routing/binlogrouter/blr_slave.c @@ -3620,7 +3620,8 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave) /* if running return an error */ if (router->master_state != BLRM_UNCONNECTED && - router->master_state != BLRM_SLAVE_STOPPED) + router->master_state != BLRM_SLAVE_STOPPED && + router->master_state != BLRM_CONNECTING) { blr_slave_send_warning_message(router, slave, @@ -3631,6 +3632,7 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave) spinlock_acquire(&router->lock); router->master_state = BLRM_UNCONNECTED; + router->retry_backoff = 0; spinlock_release(&router->lock); /**