MXS-2223 Log a message when a slave is discriminated due to replication lag
Both the replication lag and the message-printing state are stored in SERVER, although the values are mostly used by readwritesplit. A log message is printed both when a server goes over the limit and when it comes back below it. Because the state is read and written without locking, the same message may be printed several times before all threads see the new state. The documentation has been updated to explain the change.
This commit is contained in:
@ -109,12 +109,14 @@ This feature is disabled by default.
|
|||||||
|
|
||||||
max_slave_replication_lag=<allowed lag in seconds>
|
max_slave_replication_lag=<allowed lag in seconds>
|
||||||
|
|
||||||
This applies to Master/Slave replication with MySQL monitor and
|
The Readwritesplit-router does not detect the replication lag itself. A monitor
|
||||||
`detect_replication_lag=1` options set. max_slave_replication_lag must be
|
such as the MariaDB-monitor for a Master/Slave-cluster is required. This option
|
||||||
greater than the monitor interval.
|
only affects Master-Slave clusters. Galera clusters do not have a concept of
|
||||||
|
slave lag even if the application of write sets might have lag. When a server is
|
||||||
This option only affects Master-Slave clusters. Galera clusters do not have a
|
disqualified from routing because of replication lag, a warning is logged. Similarly,
|
||||||
concept of slave lag even if the application of write sets might have lag.
|
when the server has caught up enough to be a valid routing target, another warning
|
||||||
|
is logged. These messages are only logged when a query is being routed and the
|
||||||
|
replication state changes.
|
||||||
|
|
||||||
### `use_sql_variables_in`
|
### `use_sql_variables_in`
|
||||||
|
|
||||||
|
@ -94,6 +94,13 @@ typedef enum
|
|||||||
SERVER_TYPE_MYSQL
|
SERVER_TYPE_MYSQL
|
||||||
} server_type_t;
|
} server_type_t;
|
||||||
|
|
||||||
|
/**
 * Replication lag state of a server relative to the configured lag limit.
 * Stored in SERVER::rlag_state and used by readwritesplit to decide when a
 * lag warning should be logged.
 */
typedef enum
|
||||||
|
{
|
||||||
|
RLAG_NONE, /**< Initial value assigned in server_alloc(), no lag state recorded yet */
|
||||||
|
RLAG_BELOW_LIMIT, /**< Lag limit is in use and the server passed the lag check */
|
||||||
|
RLAG_ABOVE_LIMIT /**< Server is otherwise usable but failed the lag check */
|
||||||
|
} RLAG_STATE;
|
||||||
|
|
||||||
static inline void server_decode_version(uint64_t version, SERVER_VERSION* server_version)
|
static inline void server_decode_version(uint64_t version, SERVER_VERSION* server_version)
|
||||||
{
|
{
|
||||||
uint32_t major = version / 10000;
|
uint32_t major = version / 10000;
|
||||||
@ -159,11 +166,13 @@ typedef struct server
|
|||||||
* */
|
* */
|
||||||
unsigned long node_ts; /**< Last timestamp set from M/S monitor module */
|
unsigned long node_ts; /**< Last timestamp set from M/S monitor module */
|
||||||
long master_id; /**< Master server id of this node */
|
long master_id; /**< Master server id of this node */
|
||||||
|
|
||||||
// Misc fields
|
// Misc fields
|
||||||
bool master_err_is_logged; /**< If node failed, this indicates whether it is logged. Only
|
bool master_err_is_logged; /**< If node failed, this indicates whether it is logged. Only used
|
||||||
* used
|
|
||||||
* by rwsplit. TODO: Move to rwsplit */
|
* by rwsplit. TODO: Move to rwsplit */
|
||||||
bool warn_ssl_not_enabled; /**< SSL not used for an SSL enabled server */
|
bool warn_ssl_not_enabled; /**< SSL not used for an SSL enabled server */
|
||||||
|
RLAG_STATE rlag_state; /**< Is replication lag above or under limit? Used by rwsplit. */
|
||||||
|
|
||||||
MxsDiskSpaceThreshold* disk_space_threshold;/**< Disk space thresholds */
|
MxsDiskSpaceThreshold* disk_space_threshold;/**< Disk space thresholds */
|
||||||
} SERVER;
|
} SERVER;
|
||||||
|
|
||||||
|
@ -183,6 +183,7 @@ SERVER* server_alloc(const char* name, MXS_CONFIG_PARAMETER* params)
|
|||||||
server->master_id = -1;
|
server->master_id = -1;
|
||||||
server->master_err_is_logged = false;
|
server->master_err_is_logged = false;
|
||||||
server->warn_ssl_not_enabled = true;
|
server->warn_ssl_not_enabled = true;
|
||||||
|
server->rlag_state = RLAG_NONE;
|
||||||
server->disk_space_threshold = NULL;
|
server->disk_space_threshold = NULL;
|
||||||
|
|
||||||
if (*monuser && *monpw)
|
if (*monuser && *monpw)
|
||||||
|
@ -19,6 +19,7 @@
|
|||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <strings.h>
|
#include <strings.h>
|
||||||
|
|
||||||
|
#include <maxbase/atomic.hh>
|
||||||
#include <maxscale/alloc.h>
|
#include <maxscale/alloc.h>
|
||||||
#include <maxscale/clock.h>
|
#include <maxscale/clock.h>
|
||||||
#include <maxscale/modutil.h>
|
#include <maxscale/modutil.h>
|
||||||
@ -591,6 +592,38 @@ SRWBackend RWSplitSession::get_hinted_backend(char* name)
|
|||||||
return rval;
|
return rval;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Change server replication lag state and log warning when state changes.
|
||||||
|
*
|
||||||
|
* @param backend Affected server
|
||||||
|
* @param new_state New replication lag state
|
||||||
|
* @param max_rlag Maximum allowed lag. Used for the log message.
|
||||||
|
*/
|
||||||
|
static void change_rlag_state(SRWBackend& backend, RLAG_STATE new_state, int max_rlag)
|
||||||
|
{
|
||||||
|
mxb_assert(new_state == RLAG_BELOW_LIMIT || new_state == RLAG_ABOVE_LIMIT);
|
||||||
|
namespace atom = maxbase::atomic;
|
||||||
|
auto srv = backend->server();
|
||||||
|
auto old_state = atom::load(&srv->rlag_state, atom::RELAXED);
|
||||||
|
if (new_state != old_state)
|
||||||
|
{
|
||||||
|
atom::store(&srv->rlag_state, new_state, atom::RELAXED);
|
||||||
|
// State has just changed, log warning. Don't log catchup if old state was RLAG_NONE.
|
||||||
|
if (new_state == RLAG_ABOVE_LIMIT)
|
||||||
|
{
|
||||||
|
MXS_WARNING("Replication lag of '%s' is %is, which is above the configured limit %is. "
|
||||||
|
"'%s' is excluded from query routing.",
|
||||||
|
srv->name, srv->rlag, max_rlag, srv->name);
|
||||||
|
}
|
||||||
|
else if (old_state == RLAG_ABOVE_LIMIT)
|
||||||
|
{
|
||||||
|
MXS_WARNING("Replication lag of '%s' is %is, which is below the allowed limit %is. "
|
||||||
|
"'%s' is returned to query routing.",
|
||||||
|
srv->name, srv->rlag, max_rlag, srv->name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
SRWBackend RWSplitSession::get_slave_backend(int max_rlag)
|
SRWBackend RWSplitSession::get_slave_backend(int max_rlag)
|
||||||
{
|
{
|
||||||
// create a list of useable backends (includes masters, function name is a bit off),
|
// create a list of useable backends (includes masters, function name is a bit off),
|
||||||
@ -607,14 +640,25 @@ SRWBackend RWSplitSession::get_slave_backend(int max_rlag)
|
|||||||
&& counts.second < m_router->max_slave_count();
|
&& counts.second < m_router->max_slave_count();
|
||||||
|
|
||||||
bool master_or_slave = backend->is_master() || backend->is_slave();
|
bool master_or_slave = backend->is_master() || backend->is_slave();
|
||||||
bool is_useable = backend->in_use() || can_take_slave_into_use;
|
bool is_usable = backend->in_use() || can_take_slave_into_use;
|
||||||
bool not_a_slacker = rpl_lag_is_ok(backend, max_rlag);
|
bool rlag_ok = rpl_lag_is_ok(backend, max_rlag);
|
||||||
|
|
||||||
bool server_is_candidate = master_or_slave && is_useable && not_a_slacker;
|
if (master_or_slave && is_usable)
|
||||||
|
{
|
||||||
if (server_is_candidate)
|
if (rlag_ok)
|
||||||
{
|
{
|
||||||
candidates.push_back(&backend);
|
candidates.push_back(&backend);
|
||||||
|
if (max_rlag > 0)
|
||||||
|
{
|
||||||
|
// Replication lag discrimination is on and the server passed.
|
||||||
|
change_rlag_state(backend, RLAG_BELOW_LIMIT, max_rlag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// The server is otherwise usable except it's lagging too much.
|
||||||
|
change_rlag_state(backend, RLAG_ABOVE_LIMIT, max_rlag);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user