MXS-1156: added limit to master connect retry

MXS-1156: added limit to master connect retry
This commit is contained in:
MassimilianoPinto 2017-08-29 18:31:17 +02:00
parent 3c7a0014a9
commit c91187d4de
4 changed files with 128 additions and 35 deletions

View File

@ -326,7 +326,7 @@ createInstance(SERVICE *service, char **options)
inst->user = MXS_STRDUP_A(service->credentials.name);
inst->password = MXS_STRDUP_A(service->credentials.authdata);
inst->retry_backoff = 1;
inst->retry_backoff = 0;
inst->m_errno = 0;
inst->m_errmsg = NULL;
@ -1584,6 +1584,8 @@ diagnostics(MXS_ROUTER *router, DCB *dcb)
router_inst->stats.n_masterstarts);
dcb_printf(dcb, "\tNumber of delayed reconnects: %d\n",
router_inst->stats.n_delayedreconnects);
dcb_printf(dcb, "\tNumber of connect retries: %d\n",
router_inst->retry_backoff);
dcb_printf(dcb, "\tCurrent binlog file: %s\n",
router_inst->binlog_name);
dcb_printf(dcb, "\tCurrent binlog position: %lu\n",

View File

@ -238,10 +238,12 @@ typedef enum
/**
* master reconnect backoff constants
* BLR_MASTER_BACKOFF_TIME The increments of the back off time (seconds)
* BLR_MAX_BACKOFF Maximum number of increments to backoff to
* BLR_MASTER_CONNECT_RETRY The connect retry interval
* BLR_BLR_MASTER_RETRY_COUNT Maximum value of retries
*/
#define BLR_MASTER_BACKOFF_TIME 10
#define BLR_MAX_BACKOFF 60
#define BLR_MASTER_BACKOFF_TIME 10
#define BLR_MASTER_CONNECT_RETRY 60
#define BLR_MASTER_RETRY_COUNT 1000
/* max size for error message returned to client */
#define BINLOG_ERROR_MSG_LEN 700

View File

@ -144,10 +144,10 @@ extern int blr_write_special_event(ROUTER_INSTANCE *router,
extern int blr_file_new_binlog(ROUTER_INSTANCE *router, char *file);
static bool blr_handle_missing_files(ROUTER_INSTANCE *router,
char *new_file);
static void worker_cb_start_master(int worker_id, void* data);
extern bool blr_file_exists(ROUTER_INSTANCE *router);
extern void blr_file_update_gtid(ROUTER_INSTANCE *router);
static int blr_check_connect_retry(ROUTER_INSTANCE *router);
static int keepalive = 1;
@ -186,25 +186,53 @@ static void blr_start_master(void* data)
router->stats.n_binlogs_ses = 0;
spinlock_acquire(&router->lock);
if (router->master_state != BLRM_UNCONNECTED)
{
if (router->master_state != BLRM_SLAVE_STOPPED)
if (router->master_state != BLRM_SLAVE_STOPPED &&
router->master_state != BLRM_CONNECTING)
{
MXS_ERROR("%s: Master Connect: Unexpected master state %s\n",
MXS_ERROR("%s: Master Connect: Unexpected master state [%s]\n",
router->service->name,
blrm_states[router->master_state]);
}
else
{
MXS_NOTICE("%s: Master Connect: binlog state is %s\n",
router->service->name,
blrm_states[router->master_state]);
MXS_NOTICE("%s: Master Connect: binlog current state is [%s]\n",
router->service->name,
blrm_states[router->master_state]);
}
/* Return only if state is not BLRM_CONNECTING */
if (router->master_state != BLRM_CONNECTING)
{
spinlock_release(&router->lock);
return;
}
}
/* Check whether master connection can be started */
int connect_retry;
if ((connect_retry = blr_check_connect_retry(router)) == 0)
{
/* Force stopped state */
router->master_state = BLRM_SLAVE_STOPPED;
spinlock_release(&router->lock);
MXS_ERROR("%s: failure while connecting to master server '%s', "
"reached %d maximum number of retries. "
"Replication is stopped.",
router->service->name,
router->service->dbref->server->unique_name,
BLR_MASTER_RETRY_COUNT);
return;
}
/* Force connecting state */
router->master_state = BLRM_CONNECTING;
/* Increment retry counter */
router->retry_backoff++;
spinlock_release(&router->lock);
DCB* client = dcb_alloc(DCB_ROLE_INTERNAL, NULL);
@ -242,23 +270,24 @@ static void blr_start_master(void* data)
router->session,
BLR_PROTOCOL)) == NULL)
{
char *name = MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1);
/* Set reconnection task */
static const char master[] = "Master";
char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master));
if (name)
{
sprintf(name, "%s Master", router->service->name);
sprintf(name, "%s %s", router->service->name, master);
hktask_oneshot(name,
blr_start_master_in_main,
router,
BLR_MASTER_BACKOFF_TIME * router->retry_backoff++);
connect_retry);
MXS_FREE(name);
}
if (router->retry_backoff > BLR_MAX_BACKOFF)
{
router->retry_backoff = BLR_MAX_BACKOFF;
}
MXS_ERROR("failed to connect to master server '%s'",
router->service->dbref->server->unique_name);
MXS_ERROR("%s: failure while connecting to master server '%s', "
"retrying in %d seconds",
router->service->name,
router->service->dbref->server->unique_name,
connect_retry);
return;
}
router->master->remote = MXS_STRDUP_A(router->service->dbref->server->name);
@ -349,28 +378,55 @@ blr_restart_master(ROUTER_INSTANCE *router)
spinlock_acquire(&router->lock);
router->reconnect_pending = 0;
router->active_logs = 0;
spinlock_release(&router->lock);
if (router->master_state < BLRM_BINLOGDUMP)
{
router->master_state = BLRM_UNCONNECTED;
int connect_retry;
if ((connect_retry = blr_check_connect_retry(router)) == 0)
{
/* Force stopped state */
router->master_state = BLRM_SLAVE_STOPPED;
spinlock_release(&router->lock);
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1);
MXS_ERROR("%s: failed to connect to master server '%s', "
"reached %d maximum number of retries. "
"Replication is stopped.",
router->service->name,
router->service->dbref->server->unique_name,
BLR_MASTER_RETRY_COUNT);
return;
}
/* Force unconnected state */
router->master_state = BLRM_UNCONNECTED;
spinlock_release(&router->lock);
/* Set reconnection task */
static const char master[] = "Master";
char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master));
if (name)
{
sprintf(name, "%s Master", router->service->name);
hktask_oneshot(name, blr_start_master_in_main, router,
BLR_MASTER_BACKOFF_TIME * router->retry_backoff++);
sprintf(name, "%s %s", router->service->name, master);
hktask_oneshot(name,
blr_start_master_in_main,
router,
connect_retry);
MXS_FREE(name);
}
if (router->retry_backoff > BLR_MAX_BACKOFF)
{
router->retry_backoff = BLR_MAX_BACKOFF;
MXS_ERROR("%s: failed to connect to master server '%s', "
"retrying in %d seconds",
router->service->name,
router->service->dbref->server->unique_name,
connect_retry);
}
}
else
{
/* Force unconnected state */
router->master_state = BLRM_UNCONNECTED;
spinlock_release(&router->lock);
blr_start_master_in_main(router);
}
}
@ -445,12 +501,16 @@ blr_master_close(ROUTER_INSTANCE *router)
void
blr_master_delayed_connect(ROUTER_INSTANCE *router)
{
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master Recovery") + 1);
static const char master[] = "Master Recovery";
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + sizeof(master))
if (name)
{
sprintf(name, "%s Master Recovery", router->service->name);
hktask_oneshot(name, blr_start_master_in_main, router, 60);
sprintf(name, "%s %s", router->service->name, master);
hktask_oneshot(name,
blr_start_master_in_main,
router,
BLR_MASTER_CONNECT_RETRY);
MXS_FREE(name);
}
}
@ -2872,7 +2932,7 @@ static void blr_start_master_registration(ROUTER_INSTANCE *router, GWBUF *buf)
blr_register_send_command(router,
"SHOW VARIABLES LIKE 'SERVER_ID'",
BLRM_SERVERID);
router->retry_backoff = 1;
router->retry_backoff = 0;
break;
case BLRM_SERVERID:
// If set heartbeat is not being sent, next state is BLRM_HBPERIOD
@ -3463,3 +3523,30 @@ static bool blr_handle_missing_files(ROUTER_INSTANCE *router,
// Did nothing, just return true
return true;
}
/**
* Check the connection retry limit and increment
* by BLR_MASTER_BACKOFF_TIME up to router->retry_interval.
*
* @param router The current router instance
* @return The interval to use for next reconnect
* or 0 if router->retry_limit has been hit.
*/
static int blr_check_connect_retry(ROUTER_INSTANCE *router)
{
/* Stop reconnection to master */
if (router->retry_backoff >= BLR_MASTER_RETRY_COUNT)
{
return 0;
}
/* Return the interval for next reconnect */
if (router->retry_backoff >= BLR_MASTER_CONNECT_RETRY / BLR_MASTER_BACKOFF_TIME)
{
return BLR_MASTER_CONNECT_RETRY;
}
else
{
return BLR_MASTER_BACKOFF_TIME * (1 + router->retry_backoff);
}
}

View File

@ -3620,7 +3620,8 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave)
/* if running return an error */
if (router->master_state != BLRM_UNCONNECTED &&
router->master_state != BLRM_SLAVE_STOPPED)
router->master_state != BLRM_SLAVE_STOPPED &&
router->master_state != BLRM_CONNECTING)
{
blr_slave_send_warning_message(router,
slave,
@ -3631,6 +3632,7 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave)
spinlock_acquire(&router->lock);
router->master_state = BLRM_UNCONNECTED;
router->retry_backoff = 0;
spinlock_release(&router->lock);
/**