MXS-756: Retry reads on slave failure

If a slave fails while a non-critical read is being executed, the read is
retried on a different server. This is controlled by the new
`retry_failed_reads` option.

Only selects that are executed outside of a transaction and with
autocommit enabled are retried.
This commit is contained in:
Markus Makela
2016-12-06 00:32:40 +02:00
parent a66c8fb919
commit 7a04259fc0
5 changed files with 102 additions and 16 deletions

View File

@ -148,6 +148,16 @@ as slave servers are available.
to the master is lost, clients will not be able to execute write queries without to the master is lost, clients will not be able to execute write queries without
reconnecting to MariaDB MaxScale once a new master is available. reconnecting to MariaDB MaxScale once a new master is available.
### `retry_failed_reads`
This option controls whether autocommit selects are retried in case of
failure. This option is enabled by default.
When a simple autocommit select is being executed outside of a transaction
and the slave server where the query is being executed fails,
readwritesplit can retry the read on a replacement server. This makes the
failure of a slave transparent to the client.
## Routing hints ## Routing hints
The readwritesplit router supports routing hints. For a detailed guide on hint The readwritesplit router supports routing hints. For a detailed guide on hint

View File

@ -198,6 +198,9 @@ static ROUTER *createInstance(SERVICE *service, char **options)
* failure is detected */ * failure is detected */
router->rwsplit_config.rw_master_failure_mode = RW_FAIL_INSTANTLY; router->rwsplit_config.rw_master_failure_mode = RW_FAIL_INSTANTLY;
/** Try to retry failed reads */
router->rwsplit_config.rw_retry_failed_reads = true;
/** Call this before refreshInstance */ /** Call this before refreshInstance */
if (options && !rwsplit_process_router_options(router, options)) if (options && !rwsplit_process_router_options(router, options))
{ {
@ -670,6 +673,10 @@ static void clientReply(ROUTER *instance, void *router_session, GWBUF *writebuf,
CHK_BACKEND_REF(bref); CHK_BACKEND_REF(bref);
scur = &bref->bref_sescmd_cur; scur = &bref->bref_sescmd_cur;
/** Statement was successfully executed, free the stored statement */
session_clear_stmt(backend_dcb->session);
/** /**
* Active cursor means that reply is from session command * Active cursor means that reply is from session command
* execution. * execution.
@ -804,7 +811,7 @@ lock_failed:
*/ */
static uint64_t getCapabilities(void) static uint64_t getCapabilities(void)
{ {
return RCAP_TYPE_STMT_INPUT; return RCAP_TYPE_STMT_INPUT | RCAP_TYPE_TRANSACTION_TRACKING;
} }
/* /*
@ -1233,6 +1240,10 @@ static bool rwsplit_process_router_options(ROUTER_INSTANCE *router,
{ {
router->rwsplit_config.rw_strict_multi_stmt = config_truth_value(value); router->rwsplit_config.rw_strict_multi_stmt = config_truth_value(value);
} }
else if (strcmp(options[i], "retry_failed_reads") == 0)
{
router->rwsplit_config.rw_retry_failed_reads = config_truth_value(value);
}
else if (strcmp(options[i], "master_failure_mode") == 0) else if (strcmp(options[i], "master_failure_mode") == 0)
{ {
if (strcasecmp(value, "fail_instantly") == 0) if (strcasecmp(value, "fail_instantly") == 0)
@ -1504,6 +1515,53 @@ static void handle_error_reply_client(SESSION *ses, ROUTER_CLIENT_SES *rses,
} }
} }
/**
 * Try to re-send a stored read to another backend after its original
 * target failed.
 *
 * Nothing is attempted while a transaction is active on the client session.
 * Otherwise the backend references of the router session are scanned in
 * order and the statement is written to the first in-use backend, other than
 * @c old, whose server is a slave and not a master. If no such write
 * succeeds, the master reference is tried as a fallback when it is set and
 * in use.
 *
 * NOTE(review): the same @c stored buffer is handed to every write attempt;
 * this assumes a failed write leaves the buffer usable -- confirm against
 * the backend protocol's write implementation.
 *
 * @param rses   Router client session whose backends are searched
 * @param old    Backend reference that failed; skipped in the slave scan
 * @param stored The stored statement to route again
 *
 * @return true if a write to a replacement backend reported success,
 *         false otherwise
 */
static bool reroute_stored_statement(ROUTER_CLIENT_SES *rses, backend_ref_t *old, GWBUF *stored)
{
    /** Reads are never retried while a transaction is active */
    if (session_trx_is_active(rses->client_dcb->session))
    {
        return false;
    }

    /** First choice: another in-use backend that is a slave and not a master */
    for (int idx = 0; idx < rses->rses_nbackends; idx++)
    {
        backend_ref_t *candidate = &rses->rses_backend_ref[idx];

        if (!BREF_IS_IN_USE(candidate) || candidate == old)
        {
            continue;
        }

        if (SERVER_IS_MASTER(candidate->ref->server) ||
            !SERVER_IS_SLAVE(candidate->ref->server))
        {
            continue;
        }

        if (candidate->bref_dcb->func.write(candidate->bref_dcb, stored))
        {
            MXS_INFO("Retrying failed read at '%s'.", candidate->ref->server->unique_name);
            return true;
        }
    }

    /**
     * Either no suitable slave exists or every write to one failed.
     * Fall back to the master if it is available.
     */
    backend_ref_t *master = rses->rses_master_ref;

    if (!master || !BREF_IS_IN_USE(master))
    {
        return false;
    }

    if (master->bref_dcb->func.write(master->bref_dcb, stored))
    {
        MXS_INFO("Retrying failed read at '%s'.", master->ref->server->unique_name);
        return true;
    }

    return false;
}
/** /**
* Check if there is backend reference pointing at failed DCB, and reset its * Check if there is backend reference pointing at failed DCB, and reset its
* flags. Then clear DCB's callback and finally : try to find replacement(s) * flags. Then clear DCB's callback and finally : try to find replacement(s)
@ -1541,8 +1599,7 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
*/ */
if ((bref = get_bref_from_dcb(myrses, backend_dcb)) == NULL) if ((bref = get_bref_from_dcb(myrses, backend_dcb)) == NULL)
{ {
succp = true; return true;
goto return_succp;
} }
CHK_BACKEND_REF(bref); CHK_BACKEND_REF(bref);
@ -1553,9 +1610,23 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
*/ */
if (BREF_IS_WAITING_RESULT(bref)) if (BREF_IS_WAITING_RESULT(bref))
{ {
GWBUF *stored;
const SERVER *target;
if (!session_take_stmt(backend_dcb->session, &stored, &target) ||
target != bref->ref->server ||
!reroute_stored_statement(*rses, bref, stored))
{
/**
* We failed to route the stored statement or no statement was
* stored for this server. Either way we can safely free the buffer.
*/
gwbuf_free(stored);
DCB *client_dcb = ses->client_dcb; DCB *client_dcb = ses->client_dcb;
client_dcb->func.write(client_dcb, gwbuf_clone(errmsg)); client_dcb->func.write(client_dcb, gwbuf_clone(errmsg));
} }
}
close_failed_bref(bref, false); close_failed_bref(bref, false);
@ -1566,8 +1637,7 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
*/ */
if (backend_dcb->state != DCB_STATE_POLLING) if (backend_dcb->state != DCB_STATE_POLLING)
{ {
succp = true; return true;
goto return_succp;
} }
/** /**
* Remove callback because this DCB won't be used * Remove callback because this DCB won't be used
@ -1597,7 +1667,6 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
ses, inst, true); ses, inst, true);
} }
return_succp:
return succp; return succp;
} }

View File

@ -229,6 +229,7 @@ typedef struct rwsplit_config_st
* to the master after a multistatement query. */ * to the master after a multistatement query. */
enum failure_mode rw_master_failure_mode; /**< Master server failure handling mode. enum failure_mode rw_master_failure_mode; /**< Master server failure handling mode.
* @see enum failure_mode */ * @see enum failure_mode */
bool rw_retry_failed_reads; /**< Retry failed reads on other servers */
} rwsplit_config_t; } rwsplit_config_t;
#if defined(PREP_STMT_CACHING) #if defined(PREP_STMT_CACHING)

View File

@ -100,7 +100,7 @@ bool handle_slave_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses, bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
DCB **target_dcb); DCB **target_dcb);
bool handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses, bool handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
GWBUF *querybuf, DCB *target_dcb); GWBUF *querybuf, DCB *target_dcb, bool store);
bool route_session_write(ROUTER_CLIENT_SES *router_cli_ses, bool route_session_write(ROUTER_CLIENT_SES *router_cli_ses,
GWBUF *querybuf, ROUTER_INSTANCE *inst, GWBUF *querybuf, ROUTER_INSTANCE *inst,
int packet_type, int packet_type,

View File

@ -152,8 +152,7 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
else if (rses_begin_locked_router_action(rses)) else if (rses_begin_locked_router_action(rses))
{ {
/* Now we have a lock on the router session */ /* Now we have a lock on the router session */
DCB *master_dcb = rses->rses_master_ref ? rses->rses_master_ref->bref_dcb : NULL; bool store_stmt = false;
/** /**
* There is a hint which either names the target backend or * There is a hint which either names the target backend or
* hint which sets maximum allowed replication lag for the * hint which sets maximum allowed replication lag for the
@ -167,6 +166,7 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
else if (TARGET_IS_SLAVE(route_target)) else if (TARGET_IS_SLAVE(route_target))
{ {
succp = handle_slave_is_target(inst, rses, &target_dcb); succp = handle_slave_is_target(inst, rses, &target_dcb);
store_stmt = rses->rses_config.rw_retry_failed_reads;
} }
else if (TARGET_IS_MASTER(route_target)) else if (TARGET_IS_MASTER(route_target))
{ {
@ -175,7 +175,8 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
if (target_dcb && succp) /*< Have DCB of the target backend */ if (target_dcb && succp) /*< Have DCB of the target backend */
{ {
handle_got_target(inst, rses, querybuf, target_dcb); ss_dassert(!store_stmt || TARGET_IS_SLAVE(route_target));
handle_got_target(inst, rses, querybuf, target_dcb, store_stmt);
} }
rses_end_locked_router_action(rses); rses_end_locked_router_action(rses);
} }
@ -1228,7 +1229,7 @@ bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
*/ */
bool bool
handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses, handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
GWBUF *querybuf, DCB *target_dcb) GWBUF *querybuf, DCB *target_dcb, bool store)
{ {
backend_ref_t *bref; backend_ref_t *bref;
sescmd_cursor_t *scur; sescmd_cursor_t *scur;
@ -1254,6 +1255,11 @@ handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
if (target_dcb->func.write(target_dcb, gwbuf_clone(querybuf)) == 1) if (target_dcb->func.write(target_dcb, gwbuf_clone(querybuf)) == 1)
{ {
if (store && !session_store_stmt(rses->client_dcb->session, querybuf, target_dcb->server))
{
MXS_ERROR("Failed to store current statement, it won't be retried if it fails.");
}
backend_ref_t *bref; backend_ref_t *bref;
atomic_add(&inst->stats.n_queries, 1); atomic_add(&inst->stats.n_queries, 1);