MXS-756: Retry reads on slave failure

If a slave fails while a non-critical read is being executed, the read is
retried on a different server. This is controlled by the new
`retry_failed_reads` option.

Only selects done that are done outside of a transaction and with
autocommit enabled are retried.
This commit is contained in:
Markus Makela
2016-12-06 00:32:40 +02:00
parent a66c8fb919
commit 7a04259fc0
5 changed files with 102 additions and 16 deletions

View File

@ -198,6 +198,9 @@ static ROUTER *createInstance(SERVICE *service, char **options)
* failure is detected */
router->rwsplit_config.rw_master_failure_mode = RW_FAIL_INSTANTLY;
/** Try to retry failed reads */
router->rwsplit_config.rw_retry_failed_reads = true;
/** Call this before refreshInstance */
if (options && !rwsplit_process_router_options(router, options))
{
@ -670,6 +673,10 @@ static void clientReply(ROUTER *instance, void *router_session, GWBUF *writebuf,
CHK_BACKEND_REF(bref);
scur = &bref->bref_sescmd_cur;
/** Statement was successfully executed, free the stored statement */
session_clear_stmt(backend_dcb->session);
/**
* Active cursor means that reply is from session command
* execution.
@ -804,7 +811,7 @@ lock_failed:
*/
static uint64_t getCapabilities(void)
{
return RCAP_TYPE_STMT_INPUT;
return RCAP_TYPE_STMT_INPUT | RCAP_TYPE_TRANSACTION_TRACKING;
}
/*
@ -1233,6 +1240,10 @@ static bool rwsplit_process_router_options(ROUTER_INSTANCE *router,
{
router->rwsplit_config.rw_strict_multi_stmt = config_truth_value(value);
}
else if (strcmp(options[i], "retry_failed_reads") == 0)
{
router->rwsplit_config.rw_retry_failed_reads = config_truth_value(value);
}
else if (strcmp(options[i], "master_failure_mode") == 0)
{
if (strcasecmp(value, "fail_instantly") == 0)
@ -1366,8 +1377,8 @@ static void handleError(ROUTER *instance, void *router_session,
MXS_ERROR("Server %s:%d lost the master status. Readwritesplit "
"service can't locate the master. Client sessions "
"will be closed.", srv->name, srv->port);
srv->master_err_is_logged = true;
}
srv->master_err_is_logged = true;
}
*succp = can_continue;
@ -1380,7 +1391,7 @@ static void handleError(ROUTER *instance, void *router_session,
{
MXS_ERROR("Server %s:%d lost the master status but could not locate the "
"corresponding backend ref.", srv->name, srv->port);
}
}
}
else if (bref)
{
@ -1504,6 +1515,53 @@ static void handle_error_reply_client(SESSION *ses, ROUTER_CLIENT_SES *rses,
}
}
static bool reroute_stored_statement(ROUTER_CLIENT_SES *rses, backend_ref_t *old, GWBUF *stored)
{
bool success = false;
if (!session_trx_is_active(rses->client_dcb->session))
{
/**
* Only try to retry the read if autocommit is enabled and we are
* outside of a transaction
*/
for (int i = 0; i < rses->rses_nbackends; i++)
{
backend_ref_t *bref = &rses->rses_backend_ref[i];
if (BREF_IS_IN_USE(bref) && bref != old &&
!SERVER_IS_MASTER(bref->ref->server) &&
SERVER_IS_SLAVE(bref->ref->server))
{
/** Found a valid candidate; a non-master slave that's in use */
if (bref->bref_dcb->func.write(bref->bref_dcb, stored))
{
MXS_INFO("Retrying failed read at '%s'.", bref->ref->server->unique_name);
success = true;
break;
}
}
}
if (!success && rses->rses_master_ref && BREF_IS_IN_USE(rses->rses_master_ref))
{
/**
* Either we failed to write to the slave or no valid slave was found.
* Try to retry the read on the master.
*/
backend_ref_t *bref = rses->rses_master_ref;
if (bref->bref_dcb->func.write(bref->bref_dcb, stored))
{
MXS_INFO("Retrying failed read at '%s'.", bref->ref->server->unique_name);
success = true;
}
}
}
return success;
}
/**
* Check if there is backend reference pointing at failed DCB, and reset its
* flags. Then clear DCB's callback and finally : try to find replacement(s)
@ -1541,8 +1599,7 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
*/
if ((bref = get_bref_from_dcb(myrses, backend_dcb)) == NULL)
{
succp = true;
goto return_succp;
return true;
}
CHK_BACKEND_REF(bref);
@ -1553,8 +1610,22 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
*/
if (BREF_IS_WAITING_RESULT(bref))
{
DCB *client_dcb = ses->client_dcb;
client_dcb->func.write(client_dcb, gwbuf_clone(errmsg));
GWBUF *stored;
const SERVER *target;
if (!session_take_stmt(backend_dcb->session, &stored, &target) ||
target != bref->ref->server ||
!reroute_stored_statement(*rses, bref, stored))
{
/**
* We failed to route the stored statement or no statement was
* stored for this server. Either way we can safely free the buffer.
*/
gwbuf_free(stored);
DCB *client_dcb = ses->client_dcb;
client_dcb->func.write(client_dcb, gwbuf_clone(errmsg));
}
}
close_failed_bref(bref, false);
@ -1566,8 +1637,7 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
*/
if (backend_dcb->state != DCB_STATE_POLLING)
{
succp = true;
goto return_succp;
return true;
}
/**
* Remove callback because this DCB won't be used
@ -1597,7 +1667,6 @@ static bool handle_error_new_connection(ROUTER_INSTANCE *inst,
ses, inst, true);
}
return_succp:
return succp;
}

View File

@ -229,6 +229,7 @@ typedef struct rwsplit_config_st
* to the master after a multistatement query. */
enum failure_mode rw_master_failure_mode; /**< Master server failure handling mode.
* @see enum failure_mode */
bool rw_retry_failed_reads; /**< Retry failed reads on other servers */
} rwsplit_config_t;
#if defined(PREP_STMT_CACHING)

View File

@ -100,7 +100,7 @@ bool handle_slave_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
DCB **target_dcb);
bool handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
GWBUF *querybuf, DCB *target_dcb);
GWBUF *querybuf, DCB *target_dcb, bool store);
bool route_session_write(ROUTER_CLIENT_SES *router_cli_ses,
GWBUF *querybuf, ROUTER_INSTANCE *inst,
int packet_type,

View File

@ -152,8 +152,7 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
else if (rses_begin_locked_router_action(rses))
{
/* Now we have a lock on the router session */
DCB *master_dcb = rses->rses_master_ref ? rses->rses_master_ref->bref_dcb : NULL;
bool store_stmt = false;
/**
* There is a hint which either names the target backend or
* hint which sets maximum allowed replication lag for the
@ -167,6 +166,7 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
else if (TARGET_IS_SLAVE(route_target))
{
succp = handle_slave_is_target(inst, rses, &target_dcb);
store_stmt = rses->rses_config.rw_retry_failed_reads;
}
else if (TARGET_IS_MASTER(route_target))
{
@ -175,7 +175,8 @@ bool route_single_stmt(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
if (target_dcb && succp) /*< Have DCB of the target backend */
{
handle_got_target(inst, rses, querybuf, target_dcb);
ss_dassert(!store_stmt || TARGET_IS_SLAVE(route_target));
handle_got_target(inst, rses, querybuf, target_dcb, store_stmt);
}
rses_end_locked_router_action(rses);
}
@ -1228,7 +1229,7 @@ bool handle_master_is_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
*/
bool
handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
GWBUF *querybuf, DCB *target_dcb)
GWBUF *querybuf, DCB *target_dcb, bool store)
{
backend_ref_t *bref;
sescmd_cursor_t *scur;
@ -1254,6 +1255,11 @@ handle_got_target(ROUTER_INSTANCE *inst, ROUTER_CLIENT_SES *rses,
if (target_dcb->func.write(target_dcb, gwbuf_clone(querybuf)) == 1)
{
if (store && !session_store_stmt(rses->client_dcb->session, querybuf, target_dcb->server))
{
MXS_ERROR("Failed to store current statement, it won't be retried if it fails.");
}
backend_ref_t *bref;
atomic_add(&inst->stats.n_queries, 1);