MXS-1156: added limit to master connect retry
MXS-1156: added limit to master connect retry
This commit is contained in:
parent
3c7a0014a9
commit
c91187d4de
@ -326,7 +326,7 @@ createInstance(SERVICE *service, char **options)
|
||||
|
||||
inst->user = MXS_STRDUP_A(service->credentials.name);
|
||||
inst->password = MXS_STRDUP_A(service->credentials.authdata);
|
||||
inst->retry_backoff = 1;
|
||||
inst->retry_backoff = 0;
|
||||
inst->m_errno = 0;
|
||||
inst->m_errmsg = NULL;
|
||||
|
||||
@ -1584,6 +1584,8 @@ diagnostics(MXS_ROUTER *router, DCB *dcb)
|
||||
router_inst->stats.n_masterstarts);
|
||||
dcb_printf(dcb, "\tNumber of delayed reconnects: %d\n",
|
||||
router_inst->stats.n_delayedreconnects);
|
||||
dcb_printf(dcb, "\tNumber of connect retries: %d\n",
|
||||
router_inst->retry_backoff);
|
||||
dcb_printf(dcb, "\tCurrent binlog file: %s\n",
|
||||
router_inst->binlog_name);
|
||||
dcb_printf(dcb, "\tCurrent binlog position: %lu\n",
|
||||
|
@ -238,10 +238,12 @@ typedef enum
|
||||
/**
|
||||
* master reconnect backoff constants
|
||||
* BLR_MASTER_BACKOFF_TIME The increments of the back off time (seconds)
|
||||
* BLR_MAX_BACKOFF Maximum number of increments to backoff to
|
||||
* BLR_MASTER_CONNECT_RETRY The connect retry interval
|
||||
* BLR_BLR_MASTER_RETRY_COUNT Maximum value of retries
|
||||
*/
|
||||
#define BLR_MASTER_BACKOFF_TIME 10
|
||||
#define BLR_MAX_BACKOFF 60
|
||||
#define BLR_MASTER_BACKOFF_TIME 10
|
||||
#define BLR_MASTER_CONNECT_RETRY 60
|
||||
#define BLR_MASTER_RETRY_COUNT 1000
|
||||
|
||||
/* max size for error message returned to client */
|
||||
#define BINLOG_ERROR_MSG_LEN 700
|
||||
|
@ -144,10 +144,10 @@ extern int blr_write_special_event(ROUTER_INSTANCE *router,
|
||||
extern int blr_file_new_binlog(ROUTER_INSTANCE *router, char *file);
|
||||
static bool blr_handle_missing_files(ROUTER_INSTANCE *router,
|
||||
char *new_file);
|
||||
|
||||
static void worker_cb_start_master(int worker_id, void* data);
|
||||
extern bool blr_file_exists(ROUTER_INSTANCE *router);
|
||||
extern void blr_file_update_gtid(ROUTER_INSTANCE *router);
|
||||
static int blr_check_connect_retry(ROUTER_INSTANCE *router);
|
||||
|
||||
static int keepalive = 1;
|
||||
|
||||
@ -186,25 +186,53 @@ static void blr_start_master(void* data)
|
||||
|
||||
router->stats.n_binlogs_ses = 0;
|
||||
spinlock_acquire(&router->lock);
|
||||
|
||||
if (router->master_state != BLRM_UNCONNECTED)
|
||||
{
|
||||
if (router->master_state != BLRM_SLAVE_STOPPED)
|
||||
if (router->master_state != BLRM_SLAVE_STOPPED &&
|
||||
router->master_state != BLRM_CONNECTING)
|
||||
{
|
||||
MXS_ERROR("%s: Master Connect: Unexpected master state %s\n",
|
||||
MXS_ERROR("%s: Master Connect: Unexpected master state [%s]\n",
|
||||
router->service->name,
|
||||
blrm_states[router->master_state]);
|
||||
}
|
||||
else
|
||||
{
|
||||
MXS_NOTICE("%s: Master Connect: binlog state is %s\n",
|
||||
router->service->name,
|
||||
blrm_states[router->master_state]);
|
||||
MXS_NOTICE("%s: Master Connect: binlog current state is [%s]\n",
|
||||
router->service->name,
|
||||
blrm_states[router->master_state]);
|
||||
}
|
||||
|
||||
/* Return only if state is not BLRM_CONNECTING */
|
||||
if (router->master_state != BLRM_CONNECTING)
|
||||
{
|
||||
spinlock_release(&router->lock);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/* Check whether master connection can be started */
|
||||
int connect_retry;
|
||||
if ((connect_retry = blr_check_connect_retry(router)) == 0)
|
||||
{
|
||||
/* Force stopped state */
|
||||
router->master_state = BLRM_SLAVE_STOPPED;
|
||||
spinlock_release(&router->lock);
|
||||
|
||||
MXS_ERROR("%s: failure while connecting to master server '%s', "
|
||||
"reached %d maximum number of retries. "
|
||||
"Replication is stopped.",
|
||||
router->service->name,
|
||||
router->service->dbref->server->unique_name,
|
||||
BLR_MASTER_RETRY_COUNT);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Force connecting state */
|
||||
router->master_state = BLRM_CONNECTING;
|
||||
|
||||
/* Increment retry counter */
|
||||
router->retry_backoff++;
|
||||
spinlock_release(&router->lock);
|
||||
|
||||
DCB* client = dcb_alloc(DCB_ROLE_INTERNAL, NULL);
|
||||
@ -242,23 +270,24 @@ static void blr_start_master(void* data)
|
||||
router->session,
|
||||
BLR_PROTOCOL)) == NULL)
|
||||
{
|
||||
char *name = MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1);
|
||||
|
||||
/* Set reconnection task */
|
||||
static const char master[] = "Master";
|
||||
char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master));
|
||||
if (name)
|
||||
{
|
||||
sprintf(name, "%s Master", router->service->name);
|
||||
sprintf(name, "%s %s", router->service->name, master);
|
||||
hktask_oneshot(name,
|
||||
blr_start_master_in_main,
|
||||
router,
|
||||
BLR_MASTER_BACKOFF_TIME * router->retry_backoff++);
|
||||
connect_retry);
|
||||
MXS_FREE(name);
|
||||
}
|
||||
if (router->retry_backoff > BLR_MAX_BACKOFF)
|
||||
{
|
||||
router->retry_backoff = BLR_MAX_BACKOFF;
|
||||
}
|
||||
MXS_ERROR("failed to connect to master server '%s'",
|
||||
router->service->dbref->server->unique_name);
|
||||
|
||||
MXS_ERROR("%s: failure while connecting to master server '%s', "
|
||||
"retrying in %d seconds",
|
||||
router->service->name,
|
||||
router->service->dbref->server->unique_name,
|
||||
connect_retry);
|
||||
return;
|
||||
}
|
||||
router->master->remote = MXS_STRDUP_A(router->service->dbref->server->name);
|
||||
@ -349,28 +378,55 @@ blr_restart_master(ROUTER_INSTANCE *router)
|
||||
spinlock_acquire(&router->lock);
|
||||
router->reconnect_pending = 0;
|
||||
router->active_logs = 0;
|
||||
spinlock_release(&router->lock);
|
||||
|
||||
if (router->master_state < BLRM_BINLOGDUMP)
|
||||
{
|
||||
router->master_state = BLRM_UNCONNECTED;
|
||||
int connect_retry;
|
||||
if ((connect_retry = blr_check_connect_retry(router)) == 0)
|
||||
{
|
||||
/* Force stopped state */
|
||||
router->master_state = BLRM_SLAVE_STOPPED;
|
||||
spinlock_release(&router->lock);
|
||||
|
||||
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1);
|
||||
MXS_ERROR("%s: failed to connect to master server '%s', "
|
||||
"reached %d maximum number of retries. "
|
||||
"Replication is stopped.",
|
||||
router->service->name,
|
||||
router->service->dbref->server->unique_name,
|
||||
BLR_MASTER_RETRY_COUNT);
|
||||
return;
|
||||
}
|
||||
|
||||
/* Force unconnected state */
|
||||
router->master_state = BLRM_UNCONNECTED;
|
||||
spinlock_release(&router->lock);
|
||||
|
||||
/* Set reconnection task */
|
||||
static const char master[] = "Master";
|
||||
char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master));
|
||||
|
||||
if (name)
|
||||
{
|
||||
sprintf(name, "%s Master", router->service->name);
|
||||
hktask_oneshot(name, blr_start_master_in_main, router,
|
||||
BLR_MASTER_BACKOFF_TIME * router->retry_backoff++);
|
||||
sprintf(name, "%s %s", router->service->name, master);
|
||||
hktask_oneshot(name,
|
||||
blr_start_master_in_main,
|
||||
router,
|
||||
connect_retry);
|
||||
MXS_FREE(name);
|
||||
}
|
||||
if (router->retry_backoff > BLR_MAX_BACKOFF)
|
||||
{
|
||||
router->retry_backoff = BLR_MAX_BACKOFF;
|
||||
|
||||
MXS_ERROR("%s: failed to connect to master server '%s', "
|
||||
"retrying in %d seconds",
|
||||
router->service->name,
|
||||
router->service->dbref->server->unique_name,
|
||||
connect_retry);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Force unconnected state */
|
||||
router->master_state = BLRM_UNCONNECTED;
|
||||
spinlock_release(&router->lock);
|
||||
|
||||
blr_start_master_in_main(router);
|
||||
}
|
||||
}
|
||||
@ -445,12 +501,16 @@ blr_master_close(ROUTER_INSTANCE *router)
|
||||
void
|
||||
blr_master_delayed_connect(ROUTER_INSTANCE *router)
|
||||
{
|
||||
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master Recovery") + 1);
|
||||
static const char master[] = "Master Recovery";
|
||||
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + sizeof(master))
|
||||
|
||||
if (name)
|
||||
{
|
||||
sprintf(name, "%s Master Recovery", router->service->name);
|
||||
hktask_oneshot(name, blr_start_master_in_main, router, 60);
|
||||
sprintf(name, "%s %s", router->service->name, master);
|
||||
hktask_oneshot(name,
|
||||
blr_start_master_in_main,
|
||||
router,
|
||||
BLR_MASTER_CONNECT_RETRY);
|
||||
MXS_FREE(name);
|
||||
}
|
||||
}
|
||||
@ -2872,7 +2932,7 @@ static void blr_start_master_registration(ROUTER_INSTANCE *router, GWBUF *buf)
|
||||
blr_register_send_command(router,
|
||||
"SHOW VARIABLES LIKE 'SERVER_ID'",
|
||||
BLRM_SERVERID);
|
||||
router->retry_backoff = 1;
|
||||
router->retry_backoff = 0;
|
||||
break;
|
||||
case BLRM_SERVERID:
|
||||
// If set heartbeat is not being sent, next state is BLRM_HBPERIOD
|
||||
@ -3463,3 +3523,30 @@ static bool blr_handle_missing_files(ROUTER_INSTANCE *router,
|
||||
// Did nothing, just return true
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check the connection retry limit and increment
|
||||
* by BLR_MASTER_BACKOFF_TIME up to router->retry_interval.
|
||||
*
|
||||
* @param router The current router instance
|
||||
* @return The interval to use for next reconnect
|
||||
* or 0 if router->retry_limit has been hit.
|
||||
*/
|
||||
static int blr_check_connect_retry(ROUTER_INSTANCE *router)
|
||||
{
|
||||
/* Stop reconnection to master */
|
||||
if (router->retry_backoff >= BLR_MASTER_RETRY_COUNT)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Return the interval for next reconnect */
|
||||
if (router->retry_backoff >= BLR_MASTER_CONNECT_RETRY / BLR_MASTER_BACKOFF_TIME)
|
||||
{
|
||||
return BLR_MASTER_CONNECT_RETRY;
|
||||
}
|
||||
else
|
||||
{
|
||||
return BLR_MASTER_BACKOFF_TIME * (1 + router->retry_backoff);
|
||||
}
|
||||
}
|
||||
|
@ -3620,7 +3620,8 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave)
|
||||
|
||||
/* if running return an error */
|
||||
if (router->master_state != BLRM_UNCONNECTED &&
|
||||
router->master_state != BLRM_SLAVE_STOPPED)
|
||||
router->master_state != BLRM_SLAVE_STOPPED &&
|
||||
router->master_state != BLRM_CONNECTING)
|
||||
{
|
||||
blr_slave_send_warning_message(router,
|
||||
slave,
|
||||
@ -3631,6 +3632,7 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave)
|
||||
|
||||
spinlock_acquire(&router->lock);
|
||||
router->master_state = BLRM_UNCONNECTED;
|
||||
router->retry_backoff = 0;
|
||||
spinlock_release(&router->lock);
|
||||
|
||||
/**
|
||||
|
Loading…
x
Reference in New Issue
Block a user