MXS-756: Retry reads on slave failure
If a slave fails while a non-critical read is being executed, the read is retried on a different server. This is controlled by the new `retry_failed_reads` option. Only selects done that are done outside of a transaction and with autocommit enabled are retried.
This commit is contained in:
@ -148,6 +148,16 @@ as slave servers are available.
|
|||||||
to the master is lost, clients will not be able to execute write queries without
|
to the master is lost, clients will not be able to execute write queries without
|
||||||
reconnecting to MariaDB MaxScale once a new master is available.
|
reconnecting to MariaDB MaxScale once a new master is available.
|
||||||
|
|
||||||
|
### `retry_failed_reads`
|
||||||
|
|
||||||
|
This option controls whether autocommit selects are retried in case of
|
||||||
|
failure. This option is enabled by default.
|
||||||
|
|
||||||
|
When a simple autocommit select is being executed outside of a transaction
|
||||||
|
and the slave server where the query is being executed fails,
|
||||||
|
readwritesplit can retry the read on a replacement server. This makes the
|
||||||
|
failure of a slave transparent to the client.
|
||||||
|
|
||||||
## Routing hints
|
## Routing hints
|
||||||
|
|
||||||
The readwritesplit router supports routing hints. For a detailed guide on hint
|
The readwritesplit router supports routing hints. For a detailed guide on hint
|
||||||
|
@ -198,6 +198,9 @@ static ROUTER *createInstance(SERVICE *service, char **options)
|
|||||||
* failure is detected */
|
* failure is detected */
|
||||||
router->rwsplit_config.rw_master_failure_mode = RW_FAIL_INSTANTLY;
|
router->rwsplit_config.rw_master_failure_mode = RW_FAIL_INSTANTLY;
|
||||||
|
|
||||||
|
/** Try to retry failed reads */
|
||||||
|
router->rwsplit_config.rw_retry_failed_reads = true;
|
||||||
|
|
||||||
/** Call this before refreshInstance */
|
/** Call this before refreshInstance */
|
||||||
if (options && !rwsplit_process_router_options(router, options))
|
if (options && !rwsplit_process_router_options(router, options))
|
||||||
{
|
{
|
||||||
@ -670,6 +673,10 @@ static void clientReply(ROUTER *instance, void *router_session, GWBUF *writebuf,
|
|||||||
|
|
||||||
CHK_BACKEND_REF(bref);
|
CHK_BACKEND_REF(bref);
|
||||||
scur = &bref->bref_sescmd_cur;
|
scur = &bref->bref_sescmd_cur;
|
||||||
|
|
||||||
|
/** Statement was successfully executed, free the stored statement */
|
||||||
|
session_clear_stmt(backend_dcb->session);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Active cursor means that reply is from session command
|
* Active cursor means that reply is from session command
|
||||||
* execution.
|
* execution.
|
||||||
@ -804,7 +811,7 @@ lock_failed:
|
|||||||
*/
|
*/
|
||||||
static uint64_t getCapabilities(void)
|
static uint64_t getCapabilities(void)
|
||||||
{
|
{
|
||||||
return RCAP_TYPE_STMT_INPUT;
|
return RCAP_TYPE_STMT_INPUT | RCAP_TYPE_TRANSACTION_TRACKING;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1233,6 +1240,10 @@ static bool rwsplit_process_router_options(ROUTER_INSTANCE *router,
|
|||||||
{
|
{
|
||||||
router->rwsplit_config.rw_strict_multi_stmt = config_truth_value(value);
|
router->rwsplit_config.rw_strict_multi_stmt = config_truth_value(value);
|
||||||
}
|
}
|
||||||
|
else if (strcmp(options[i], "retry_failed_reads") == 0)
|
||||||
|
{
|
||||||
|
router->rwsplit_config.rw_retry_failed_reads = config_truth_value(value);
|
||||||
|
}
|
||||||
else if (strcmp(options[i], "master_failure_mode") == 0)
|
else if (strcmp(options[i], "master_failure_mode") == 0)
|
||||||
{
|
{
|
||||||
if (strcasecmp(value, "fail_instantly") == 0)
|
if (strcasecmp(value, "fail_instantly") == 0)
|
||||||
@ -1366,8 +1377,8 @@ static void handleError(ROUTER *instance, void *router_session,
|
|||||||
MXS_ERROR("Server %s:%d lost the master status. Readwritesplit "
|
MXS_ERROR("Server %s:%d lost the master status. Readwritesplit "
|
||||||
"service can't locate the master. Client sessions "
|
"service can't locate the master. Client sessions "
|
||||||
"will be closed.", srv->name, srv->port);
|
"will be closed.", srv->name, srv->port);
|
||||||
srv->master_err_is_logged = true;
|
srv->master_err_is_logged = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
*succp = can_continue;
|
*succp = can_continue;
|
||||||
|
|
||||||
@ -1380,7 +1391,7 @@ static void handleError(ROUTER *instance, void *router_session,
|
|||||||
{
|
{
|
||||||
MXS_ERROR("Server %s:%d lost the master status but could not locate the "
|
MXS_ERROR("Server %s:%d lost the master status but could not locate the "
|
||||||
"corresponding backend ref.", srv->name, srv->port);
|
"corresponding backend ref.", srv->name, srv->port);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (bref)
|
else if (bref)
|
||||||
{
|
{
|
||||||
@ -1504,6 +1515,53 @@ static void handle_error_reply_client(SESSION *ses, ROUTER_CLIENT_SES *rses,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool reroute_stored_statement(ROUTER_CLIENT_SES *rses, backend_ref_t *old, GWBUF *stored)
|
||||||
|
{
|
||||||
|
bool success = false;
|
||||||
|
|
||||||
|
if (!session_trx_is_active(rses->client_dcb->session))
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Only try to retry the read if autocommit is enabled and we are
|
||||||
|
* outside of a transaction
|
||||||
|
*/
|
||||||
|
for (int i = 0; i < rses->rses_nbackends; i++)
|
||||||
|
{
|
||||||
|
backend_ref_t *bref = &rses->rses_backend_ref[i];
|
||||||
|
|
||||||
|
if (BREF_IS_IN_USE(bref) && bref != old &&
|
||||||
|
!SERVER_IS_MASTER(bref->ref->server) &&
|
||||||
|
SERVER_IS_SLAVE(bref->ref->server))
|
||||||
|
{
|
||||||
|
/** Found a valid candidate; a non-master slave that's in use */
|
||||||
|
if (bref->bref_dcb->func.write(bref->bref_dcb, stored))
|
||||||
|
{
|
||||||
|
MXS_INFO("Retrying failed read at '%s'.", bref->ref->server->unique_name);
|
||||||
|
success = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!success && rses->rses_master_ref && BREF_IS_IN_USE(rses->rses_master_ref))
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* Either we failed to write to the slave or no valid slave was found.
|
||||||
|
* Try to retry the read on the master.
|
||||||
|
*/
|
||||||
|
backend_ref_t *bref = rses->rses_master_ref;
|
||||||
|
|
||||||
|
if (bref->bref_dcb->func.write(bref->bref_dcb, stored))
|
||||||
|
{
|
||||||
|
MXS_INFO("Retrying failed read at '%s'.", bref->ref->server->unique_name);
|
||||||
|
success = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return success;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if there is backend reference pointing at failed DCB, and reset its
|
* Check if there is backend reference pointing at failed DCB, and reset its
|
||||||
* flags. Then clear DCB's callback and finally : try to find replacement(s)
|
* flags. Then clear DCB's callback and finally : try to find replacement(s)
|
||||||
@ -1541,8 +1599,7 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
|
|||||||
*/
|
*/
|
||||||
if ((bref = get_bref_from_dcb(myrses, backend_dcb)) == NULL)
|
if ((bref = get_bref_from_dcb(myrses, backend_dcb)) == NULL)
|
||||||
{
|
{
|
||||||
succp = true;
|
return true;
|
||||||
goto return_succp;
|
|
||||||
}
|
}
|
||||||
CHK_BACKEND_REF(bref);
|
CHK_BACKEND_REF(bref);
|
||||||
|
|
||||||
@ -1553,8 +1610,22 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
|
|||||||
*/
|
*/
|
||||||
if (BREF_IS_WAITING_RESULT(bref))
|
if (BREF_IS_WAITING_RESULT(bref))
|
||||||
{
|
{
|
||||||
DCB *client_dcb = ses->client_dcb;
|
GWBUF *stored;
|
||||||
client_dcb->func.write(client_dcb, gwbuf_clone(errmsg));
|
const SERVER *target;
|
||||||
|
|
||||||
|
if (!session_take_stmt(backend_dcb->session, &stored, &target) ||
|
||||||
|
target != bref->ref->server ||
|
||||||
|
!reroute_stored_statement(*rses, bref, stored))
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* We failed to route the stored statement or no statement was
|
||||||
|
* stored for this server. Either way we can safely free the buffer.
|
||||||
|
*/
|
||||||
|
gwbuf_free(stored);
|
||||||
|
|
||||||
|
DCB *client_dcb = ses->client_dcb;
|
||||||
|
client_dcb->func.write(client_dcb, gwbuf_clone(errmsg));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
close_failed_bref(bref, false);
|
close_failed_bref(bref, false);
|
||||||
@ -1566,8 +1637,7 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
|
|||||||
*/
|
*/
|
||||||
if (backend_dcb->state != DCB_STATE_POLLING)
|
if (backend_dcb->state != DCB_STATE_POLLING)
|
||||||
{
|
{
|
||||||
succp = true;
|
return true;
|
||||||
goto return_succp;
|
|
||||||
}
|
}
|
||||||
/**
|
/**
|
||||||
* Remove callback because this DCB won't be used
|
* Remove callback because this DCB won't be used
|
||||||
@ -1597,7 +1667,6 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
|
|||||||
ses, inst, true);
|
ses, inst, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
return_succp:
|
|
||||||
return succp;
|
return succp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -229,6 +229,7 @@ typedef struct rwsplit_config_st
|
|||||||
* to the master after a multistatement query. */
|
* to the master after a multistatement query. */
|
||||||
enum failure_mode rw_master_failure_mode; /**< Master server failure handling mode.
|
enum failure_mode rw_master_failure_mode; /**< Master server failure handling mode.
|
||||||
* @see enum failure_mode */
|
* @see enum failure_mode */
|
||||||
|
bool rw_retry_failed_reads; /**< Retry failed reads on other servers */
|
||||||
} rwsplit_config_t;
|
} rwsplit_config_t;
|
||||||
|
|
||||||
#if defined(PREP_STMT_CACHING)
|
#if defined(PREP_STMT_CACHING)
|
||||||
|
@ -100,7 +100,7 @@ bool handle_slave_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
|||||||
bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
||||||
DCB **target_dcb);
|
DCB **target_dcb);
|
||||||
bool handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
bool handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
||||||
GWBUF *querybuf, DCB *target_dcb);
|
GWBUF *querybuf, DCB *target_dcb, bool store);
|
||||||
bool route_session_write(ROUTER_CLIENT_SES *router_cli_ses,
|
bool route_session_write(ROUTER_CLIENT_SES *router_cli_ses,
|
||||||
GWBUF *querybuf, ROUTER_INSTANCE *inst,
|
GWBUF *querybuf, ROUTER_INSTANCE *inst,
|
||||||
int packet_type,
|
int packet_type,
|
||||||
|
@ -152,8 +152,7 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
|||||||
else if (rses_begin_locked_router_action(rses))
|
else if (rses_begin_locked_router_action(rses))
|
||||||
{
|
{
|
||||||
/* Now we have a lock on the router session */
|
/* Now we have a lock on the router session */
|
||||||
DCB *master_dcb = rses->rses_master_ref ? rses->rses_master_ref->bref_dcb : NULL;
|
bool store_stmt = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* There is a hint which either names the target backend or
|
* There is a hint which either names the target backend or
|
||||||
* hint which sets maximum allowed replication lag for the
|
* hint which sets maximum allowed replication lag for the
|
||||||
@ -167,6 +166,7 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
|||||||
else if (TARGET_IS_SLAVE(route_target))
|
else if (TARGET_IS_SLAVE(route_target))
|
||||||
{
|
{
|
||||||
succp = handle_slave_is_target(inst, rses, &target_dcb);
|
succp = handle_slave_is_target(inst, rses, &target_dcb);
|
||||||
|
store_stmt = rses->rses_config.rw_retry_failed_reads;
|
||||||
}
|
}
|
||||||
else if (TARGET_IS_MASTER(route_target))
|
else if (TARGET_IS_MASTER(route_target))
|
||||||
{
|
{
|
||||||
@ -175,7 +175,8 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
|||||||
|
|
||||||
if (target_dcb && succp) /*< Have DCB of the target backend */
|
if (target_dcb && succp) /*< Have DCB of the target backend */
|
||||||
{
|
{
|
||||||
handle_got_target(inst, rses, querybuf, target_dcb);
|
ss_dassert(!store_stmt || TARGET_IS_SLAVE(route_target));
|
||||||
|
handle_got_target(inst, rses, querybuf, target_dcb, store_stmt);
|
||||||
}
|
}
|
||||||
rses_end_locked_router_action(rses);
|
rses_end_locked_router_action(rses);
|
||||||
}
|
}
|
||||||
@ -1228,7 +1229,7 @@ bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
|||||||
*/
|
*/
|
||||||
bool
|
bool
|
||||||
handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
||||||
GWBUF *querybuf, DCB *target_dcb)
|
GWBUF *querybuf, DCB *target_dcb, bool store)
|
||||||
{
|
{
|
||||||
backend_ref_t *bref;
|
backend_ref_t *bref;
|
||||||
sescmd_cursor_t *scur;
|
sescmd_cursor_t *scur;
|
||||||
@ -1254,6 +1255,11 @@ handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
|
|||||||
|
|
||||||
if (target_dcb->func.write(target_dcb, gwbuf_clone(querybuf)) == 1)
|
if (target_dcb->func.write(target_dcb, gwbuf_clone(querybuf)) == 1)
|
||||||
{
|
{
|
||||||
|
if (store && !session_store_stmt(rses->client_dcb->session, querybuf, target_dcb->server))
|
||||||
|
{
|
||||||
|
MXS_ERROR("Failed to store current statement, it won't be retried if it fails.");
|
||||||
|
}
|
||||||
|
|
||||||
backend_ref_t *bref;
|
backend_ref_t *bref;
|
||||||
|
|
||||||
atomic_add(&inst->stats.n_queries, 1);
|
atomic_add(&inst->stats.n_queries, 1);
|
||||||
|
Reference in New Issue
Block a user