MXS-1156: added limit to master connect retry
MXS-1156: added limit to master connect retry
This commit is contained in:
@ -326,7 +326,7 @@ createInstance(SERVICE *service, char **options)
|
|||||||
|
|
||||||
inst->user = MXS_STRDUP_A(service->credentials.name);
|
inst->user = MXS_STRDUP_A(service->credentials.name);
|
||||||
inst->password = MXS_STRDUP_A(service->credentials.authdata);
|
inst->password = MXS_STRDUP_A(service->credentials.authdata);
|
||||||
inst->retry_backoff = 1;
|
inst->retry_backoff = 0;
|
||||||
inst->m_errno = 0;
|
inst->m_errno = 0;
|
||||||
inst->m_errmsg = NULL;
|
inst->m_errmsg = NULL;
|
||||||
|
|
||||||
@ -1584,6 +1584,8 @@ diagnostics(MXS_ROUTER *router, DCB *dcb)
|
|||||||
router_inst->stats.n_masterstarts);
|
router_inst->stats.n_masterstarts);
|
||||||
dcb_printf(dcb, "\tNumber of delayed reconnects: %d\n",
|
dcb_printf(dcb, "\tNumber of delayed reconnects: %d\n",
|
||||||
router_inst->stats.n_delayedreconnects);
|
router_inst->stats.n_delayedreconnects);
|
||||||
|
dcb_printf(dcb, "\tNumber of connect retries: %d\n",
|
||||||
|
router_inst->retry_backoff);
|
||||||
dcb_printf(dcb, "\tCurrent binlog file: %s\n",
|
dcb_printf(dcb, "\tCurrent binlog file: %s\n",
|
||||||
router_inst->binlog_name);
|
router_inst->binlog_name);
|
||||||
dcb_printf(dcb, "\tCurrent binlog position: %lu\n",
|
dcb_printf(dcb, "\tCurrent binlog position: %lu\n",
|
||||||
|
|||||||
@ -238,10 +238,12 @@ typedef enum
|
|||||||
/**
|
/**
|
||||||
* master reconnect backoff constants
|
* master reconnect backoff constants
|
||||||
* BLR_MASTER_BACKOFF_TIME The increments of the back off time (seconds)
|
* BLR_MASTER_BACKOFF_TIME The increments of the back off time (seconds)
|
||||||
* BLR_MAX_BACKOFF Maximum number of increments to backoff to
|
* BLR_MASTER_CONNECT_RETRY The maximum connect retry interval (seconds)
|
||||||
|
* BLR_MASTER_RETRY_COUNT Maximum number of connect retries
|
||||||
*/
|
*/
|
||||||
#define BLR_MASTER_BACKOFF_TIME 10
|
#define BLR_MASTER_BACKOFF_TIME 10
|
||||||
#define BLR_MAX_BACKOFF 60
|
#define BLR_MASTER_CONNECT_RETRY 60
|
||||||
|
#define BLR_MASTER_RETRY_COUNT 1000
|
||||||
|
|
||||||
/* max size for error message returned to client */
|
/* max size for error message returned to client */
|
||||||
#define BINLOG_ERROR_MSG_LEN 700
|
#define BINLOG_ERROR_MSG_LEN 700
|
||||||
|
|||||||
@ -144,10 +144,10 @@ extern int blr_write_special_event(ROUTER_INSTANCE *router,
|
|||||||
extern int blr_file_new_binlog(ROUTER_INSTANCE *router, char *file);
|
extern int blr_file_new_binlog(ROUTER_INSTANCE *router, char *file);
|
||||||
static bool blr_handle_missing_files(ROUTER_INSTANCE *router,
|
static bool blr_handle_missing_files(ROUTER_INSTANCE *router,
|
||||||
char *new_file);
|
char *new_file);
|
||||||
|
|
||||||
static void worker_cb_start_master(int worker_id, void* data);
|
static void worker_cb_start_master(int worker_id, void* data);
|
||||||
extern bool blr_file_exists(ROUTER_INSTANCE *router);
|
extern bool blr_file_exists(ROUTER_INSTANCE *router);
|
||||||
extern void blr_file_update_gtid(ROUTER_INSTANCE *router);
|
extern void blr_file_update_gtid(ROUTER_INSTANCE *router);
|
||||||
|
static int blr_check_connect_retry(ROUTER_INSTANCE *router);
|
||||||
|
|
||||||
static int keepalive = 1;
|
static int keepalive = 1;
|
||||||
|
|
||||||
@ -186,25 +186,53 @@ static void blr_start_master(void* data)
|
|||||||
|
|
||||||
router->stats.n_binlogs_ses = 0;
|
router->stats.n_binlogs_ses = 0;
|
||||||
spinlock_acquire(&router->lock);
|
spinlock_acquire(&router->lock);
|
||||||
|
|
||||||
if (router->master_state != BLRM_UNCONNECTED)
|
if (router->master_state != BLRM_UNCONNECTED)
|
||||||
{
|
{
|
||||||
if (router->master_state != BLRM_SLAVE_STOPPED)
|
if (router->master_state != BLRM_SLAVE_STOPPED &&
|
||||||
|
router->master_state != BLRM_CONNECTING)
|
||||||
{
|
{
|
||||||
MXS_ERROR("%s: Master Connect: Unexpected master state %s\n",
|
MXS_ERROR("%s: Master Connect: Unexpected master state [%s]\n",
|
||||||
router->service->name,
|
router->service->name,
|
||||||
blrm_states[router->master_state]);
|
blrm_states[router->master_state]);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
MXS_NOTICE("%s: Master Connect: binlog state is %s\n",
|
MXS_NOTICE("%s: Master Connect: binlog current state is [%s]\n",
|
||||||
router->service->name,
|
router->service->name,
|
||||||
blrm_states[router->master_state]);
|
blrm_states[router->master_state]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Return only if state is not BLRM_CONNECTING */
|
||||||
|
if (router->master_state != BLRM_CONNECTING)
|
||||||
|
{
|
||||||
|
spinlock_release(&router->lock);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Check whether master connection can be started */
|
||||||
|
int connect_retry;
|
||||||
|
if ((connect_retry = blr_check_connect_retry(router)) == 0)
|
||||||
|
{
|
||||||
|
/* Force stopped state */
|
||||||
|
router->master_state = BLRM_SLAVE_STOPPED;
|
||||||
spinlock_release(&router->lock);
|
spinlock_release(&router->lock);
|
||||||
|
|
||||||
|
MXS_ERROR("%s: failure while connecting to master server '%s', "
|
||||||
|
"reached %d maximum number of retries. "
|
||||||
|
"Replication is stopped.",
|
||||||
|
router->service->name,
|
||||||
|
router->service->dbref->server->unique_name,
|
||||||
|
BLR_MASTER_RETRY_COUNT);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Force connecting state */
|
||||||
router->master_state = BLRM_CONNECTING;
|
router->master_state = BLRM_CONNECTING;
|
||||||
|
|
||||||
|
/* Increment retry counter */
|
||||||
|
router->retry_backoff++;
|
||||||
spinlock_release(&router->lock);
|
spinlock_release(&router->lock);
|
||||||
|
|
||||||
DCB* client = dcb_alloc(DCB_ROLE_INTERNAL, NULL);
|
DCB* client = dcb_alloc(DCB_ROLE_INTERNAL, NULL);
|
||||||
@ -242,23 +270,24 @@ static void blr_start_master(void* data)
|
|||||||
router->session,
|
router->session,
|
||||||
BLR_PROTOCOL)) == NULL)
|
BLR_PROTOCOL)) == NULL)
|
||||||
{
|
{
|
||||||
char *name = MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1);
|
/* Set reconnection task */
|
||||||
|
static const char master[] = "Master";
|
||||||
|
char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master));
|
||||||
if (name)
|
if (name)
|
||||||
{
|
{
|
||||||
sprintf(name, "%s Master", router->service->name);
|
sprintf(name, "%s %s", router->service->name, master);
|
||||||
hktask_oneshot(name,
|
hktask_oneshot(name,
|
||||||
blr_start_master_in_main,
|
blr_start_master_in_main,
|
||||||
router,
|
router,
|
||||||
BLR_MASTER_BACKOFF_TIME * router->retry_backoff++);
|
connect_retry);
|
||||||
MXS_FREE(name);
|
MXS_FREE(name);
|
||||||
}
|
}
|
||||||
if (router->retry_backoff > BLR_MAX_BACKOFF)
|
|
||||||
{
|
MXS_ERROR("%s: failure while connecting to master server '%s', "
|
||||||
router->retry_backoff = BLR_MAX_BACKOFF;
|
"retrying in %d seconds",
|
||||||
}
|
router->service->name,
|
||||||
MXS_ERROR("failed to connect to master server '%s'",
|
router->service->dbref->server->unique_name,
|
||||||
router->service->dbref->server->unique_name);
|
connect_retry);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
router->master->remote = MXS_STRDUP_A(router->service->dbref->server->name);
|
router->master->remote = MXS_STRDUP_A(router->service->dbref->server->name);
|
||||||
@ -349,28 +378,55 @@ blr_restart_master(ROUTER_INSTANCE *router)
|
|||||||
spinlock_acquire(&router->lock);
|
spinlock_acquire(&router->lock);
|
||||||
router->reconnect_pending = 0;
|
router->reconnect_pending = 0;
|
||||||
router->active_logs = 0;
|
router->active_logs = 0;
|
||||||
spinlock_release(&router->lock);
|
|
||||||
if (router->master_state < BLRM_BINLOGDUMP)
|
if (router->master_state < BLRM_BINLOGDUMP)
|
||||||
{
|
{
|
||||||
router->master_state = BLRM_UNCONNECTED;
|
int connect_retry;
|
||||||
|
if ((connect_retry = blr_check_connect_retry(router)) == 0)
|
||||||
|
{
|
||||||
|
/* Force stopped state */
|
||||||
|
router->master_state = BLRM_SLAVE_STOPPED;
|
||||||
|
spinlock_release(&router->lock);
|
||||||
|
|
||||||
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master") + 1);
|
MXS_ERROR("%s: failed to connect to master server '%s', "
|
||||||
|
"reached %d maximum number of retries. "
|
||||||
|
"Replication is stopped.",
|
||||||
|
router->service->name,
|
||||||
|
router->service->dbref->server->unique_name,
|
||||||
|
BLR_MASTER_RETRY_COUNT);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Force unconnected state */
|
||||||
|
router->master_state = BLRM_UNCONNECTED;
|
||||||
|
spinlock_release(&router->lock);
|
||||||
|
|
||||||
|
/* Set reconnection task */
|
||||||
|
static const char master[] = "Master";
|
||||||
|
char *name = MXS_MALLOC(strlen(router->service->name) + sizeof(master));
|
||||||
|
|
||||||
if (name)
|
if (name)
|
||||||
{
|
{
|
||||||
sprintf(name, "%s Master", router->service->name);
|
sprintf(name, "%s %s", router->service->name, master);
|
||||||
hktask_oneshot(name, blr_start_master_in_main, router,
|
hktask_oneshot(name,
|
||||||
BLR_MASTER_BACKOFF_TIME * router->retry_backoff++);
|
blr_start_master_in_main,
|
||||||
|
router,
|
||||||
|
connect_retry);
|
||||||
MXS_FREE(name);
|
MXS_FREE(name);
|
||||||
}
|
|
||||||
if (router->retry_backoff > BLR_MAX_BACKOFF)
|
MXS_ERROR("%s: failed to connect to master server '%s', "
|
||||||
{
|
"retrying in %d seconds",
|
||||||
router->retry_backoff = BLR_MAX_BACKOFF;
|
router->service->name,
|
||||||
|
router->service->dbref->server->unique_name,
|
||||||
|
connect_retry);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
/* Force unconnected state */
|
||||||
router->master_state = BLRM_UNCONNECTED;
|
router->master_state = BLRM_UNCONNECTED;
|
||||||
|
spinlock_release(&router->lock);
|
||||||
|
|
||||||
blr_start_master_in_main(router);
|
blr_start_master_in_main(router);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -445,12 +501,16 @@ blr_master_close(ROUTER_INSTANCE *router)
|
|||||||
void
|
void
|
||||||
blr_master_delayed_connect(ROUTER_INSTANCE *router)
|
blr_master_delayed_connect(ROUTER_INSTANCE *router)
|
||||||
{
|
{
|
||||||
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + strlen(" Master Recovery") + 1);
|
static const char master[] = "Master Recovery";
|
||||||
|
char *name = (char*)MXS_MALLOC(strlen(router->service->name) + sizeof(master))
|
||||||
|
|
||||||
if (name)
|
if (name)
|
||||||
{
|
{
|
||||||
sprintf(name, "%s Master Recovery", router->service->name);
|
sprintf(name, "%s %s", router->service->name, master);
|
||||||
hktask_oneshot(name, blr_start_master_in_main, router, 60);
|
hktask_oneshot(name,
|
||||||
|
blr_start_master_in_main,
|
||||||
|
router,
|
||||||
|
BLR_MASTER_CONNECT_RETRY);
|
||||||
MXS_FREE(name);
|
MXS_FREE(name);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2872,7 +2932,7 @@ static void blr_start_master_registration(ROUTER_INSTANCE *router, GWBUF *buf)
|
|||||||
blr_register_send_command(router,
|
blr_register_send_command(router,
|
||||||
"SHOW VARIABLES LIKE 'SERVER_ID'",
|
"SHOW VARIABLES LIKE 'SERVER_ID'",
|
||||||
BLRM_SERVERID);
|
BLRM_SERVERID);
|
||||||
router->retry_backoff = 1;
|
router->retry_backoff = 0;
|
||||||
break;
|
break;
|
||||||
case BLRM_SERVERID:
|
case BLRM_SERVERID:
|
||||||
// If set heartbeat is not being sent, next state is BLRM_HBPERIOD
|
// If set heartbeat is not being sent, next state is BLRM_HBPERIOD
|
||||||
@ -3463,3 +3523,30 @@ static bool blr_handle_missing_files(ROUTER_INSTANCE *router,
|
|||||||
// Did nothing, just return true
|
// Did nothing, just return true
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check the connection retry limit and increment
|
||||||
|
* by BLR_MASTER_BACKOFF_TIME up to router->retry_interval.
|
||||||
|
*
|
||||||
|
* @param router The current router instance
|
||||||
|
* @return The interval to use for next reconnect
|
||||||
|
* or 0 if router->retry_limit has been hit.
|
||||||
|
*/
|
||||||
|
static int blr_check_connect_retry(ROUTER_INSTANCE *router)
|
||||||
|
{
|
||||||
|
/* Stop reconnection to master */
|
||||||
|
if (router->retry_backoff >= BLR_MASTER_RETRY_COUNT)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Return the interval for next reconnect */
|
||||||
|
if (router->retry_backoff >= BLR_MASTER_CONNECT_RETRY / BLR_MASTER_BACKOFF_TIME)
|
||||||
|
{
|
||||||
|
return BLR_MASTER_CONNECT_RETRY;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return BLR_MASTER_BACKOFF_TIME * (1 + router->retry_backoff);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@ -3620,7 +3620,8 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave)
|
|||||||
|
|
||||||
/* if running return an error */
|
/* if running return an error */
|
||||||
if (router->master_state != BLRM_UNCONNECTED &&
|
if (router->master_state != BLRM_UNCONNECTED &&
|
||||||
router->master_state != BLRM_SLAVE_STOPPED)
|
router->master_state != BLRM_SLAVE_STOPPED &&
|
||||||
|
router->master_state != BLRM_CONNECTING)
|
||||||
{
|
{
|
||||||
blr_slave_send_warning_message(router,
|
blr_slave_send_warning_message(router,
|
||||||
slave,
|
slave,
|
||||||
@ -3631,6 +3632,7 @@ blr_start_slave(ROUTER_INSTANCE* router, ROUTER_SLAVE* slave)
|
|||||||
|
|
||||||
spinlock_acquire(&router->lock);
|
spinlock_acquire(&router->lock);
|
||||||
router->master_state = BLRM_UNCONNECTED;
|
router->master_state = BLRM_UNCONNECTED;
|
||||||
|
router->retry_backoff = 0;
|
||||||
spinlock_release(&router->lock);
|
spinlock_release(&router->lock);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user