diff --git a/Documentation/Routers/ReadWriteSplit.md b/Documentation/Routers/ReadWriteSplit.md index d4142b9a9..d42aab123 100644 --- a/Documentation/Routers/ReadWriteSplit.md +++ b/Documentation/Routers/ReadWriteSplit.md @@ -109,12 +109,14 @@ This feature is disabled by default. max_slave_replication_lag= -This applies to Master/Slave replication with MySQL monitor and -`detect_replication_lag=1` options set. max_slave_replication_lag must be -greater than the monitor interval. - -This option only affects Master-Slave clusters. Galera clusters do not have a -concept of slave lag even if the application of write sets might have lag. +The readwritesplit router does not measure replication lag itself. A monitor that +does, such as the MariaDB Monitor for a Master-Slave cluster, is required. This option +only affects Master-Slave clusters. Galera clusters do not have a concept of +slave lag even if the application of write sets might have lag. When a server is +excluded from routing because its replication lag is above the limit, a warning is logged. Similarly, +when the server has caught up enough to be a valid routing target again, another warning +is logged. These messages are only logged when a query is being routed and the +replication lag state of the server changes. ### `use_sql_variables_in` diff --git a/include/maxscale/server.h b/include/maxscale/server.h index c60b4a841..4944909d5 100644 --- a/include/maxscale/server.h +++ b/include/maxscale/server.h @@ -94,6 +94,13 @@ typedef enum SERVER_TYPE_MYSQL } server_type_t; +typedef enum +{ + RLAG_NONE, + RLAG_BELOW_LIMIT, + RLAG_ABOVE_LIMIT +} RLAG_STATE; + static inline void server_decode_version(uint64_t version, SERVER_VERSION* server_version) { uint32_t major = version / 10000; @@ -159,11 +166,13 @@ typedef struct server * */ unsigned long node_ts; /**< Last timestamp set from M/S monitor module */ long master_id; /**< Master server id of this node */ + // Misc fields - bool master_err_is_logged; /**< If node failed, this indicates whether it is logged. 
Only - * used - * by rwsplit. TODO: Move to rwsplit */ - bool warn_ssl_not_enabled;/**< SSL not used for an SSL enabled server */ + bool master_err_is_logged; /**< If node failed, this indicates whether it is logged. Only used + * by rwsplit. TODO: Move to rwsplit */ + bool warn_ssl_not_enabled; /**< SSL not used for an SSL enabled server */ + RLAG_STATE rlag_state; /**< Replication lag state: RLAG_NONE initially, then below or above the limit. Used by rwsplit. */ + MxsDiskSpaceThreshold* disk_space_threshold;/**< Disk space thresholds */ } SERVER; diff --git a/server/core/server.cc b/server/core/server.cc index aef0b9f2e..6b7ff6678 100644 --- a/server/core/server.cc +++ b/server/core/server.cc @@ -183,6 +183,7 @@ SERVER* server_alloc(const char* name, MXS_CONFIG_PARAMETER* params) server->master_id = -1; server->master_err_is_logged = false; server->warn_ssl_not_enabled = true; + server->rlag_state = RLAG_NONE; server->disk_space_threshold = NULL; if (*monuser && *monpw) diff --git a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc index 56fc1c2fb..0d0922b7c 100644 --- a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc +++ b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -591,6 +592,38 @@ SRWBackend RWSplitSession::get_hinted_backend(char* name) return rval; } +/** + * Change server replication lag state and log a warning when the state changes. No warning is logged for the first RLAG_NONE -> RLAG_BELOW_LIMIT change. + * + * @param backend Backend whose server is affected + * @param new_state New replication lag state, either RLAG_BELOW_LIMIT or RLAG_ABOVE_LIMIT + * @param max_rlag Maximum allowed lag in seconds. Used only for the log message. 
+ */ +static void change_rlag_state(SRWBackend& backend, RLAG_STATE new_state, int max_rlag) +{ + mxb_assert(new_state == RLAG_BELOW_LIMIT || new_state == RLAG_ABOVE_LIMIT); + namespace atom = maxbase::atomic; + auto srv = backend->server(); + auto old_state = atom::load(&srv->rlag_state, atom::RELAXED); + if (new_state != old_state) + { + atom::store(&srv->rlag_state, new_state, atom::RELAXED); + // State has just changed. Warn when the server goes above the limit or comes back from RLAG_ABOVE_LIMIT; a first RLAG_NONE -> RLAG_BELOW_LIMIT change is stored without logging. NOTE(review): the load and store above are separate relaxed operations, so concurrent callers could presumably both log the same change - confirm this is acceptable. + if (new_state == RLAG_ABOVE_LIMIT) + { + MXS_WARNING("Replication lag of '%s' is %is, which is above the configured limit %is. " + "'%s' is excluded from query routing.", + srv->name, srv->rlag, max_rlag, srv->name); + } + else if (old_state == RLAG_ABOVE_LIMIT) + { + MXS_WARNING("Replication lag of '%s' is %is, which is below the allowed limit %is. " + "'%s' is returned to query routing.", + srv->name, srv->rlag, max_rlag, srv->name); + } + } +} + SRWBackend RWSplitSession::get_slave_backend(int max_rlag) { // create a list of useable backends (includes masters, function name is a bit off), @@ -607,14 +640,25 @@ SRWBackend RWSplitSession::get_slave_backend(int max_rlag) && counts.second < m_router->max_slave_count(); bool master_or_slave = backend->is_master() || backend->is_slave(); - bool is_useable = backend->in_use() || can_take_slave_into_use; - bool not_a_slacker = rpl_lag_is_ok(backend, max_rlag); + bool is_usable = backend->in_use() || can_take_slave_into_use; + bool rlag_ok = rpl_lag_is_ok(backend, max_rlag); - bool server_is_candidate = master_or_slave && is_useable && not_a_slacker; - - if (server_is_candidate) + if (master_or_slave && is_usable) { - candidates.push_back(&backend); + if (rlag_ok) + { + candidates.push_back(&backend); + if (max_rlag > 0) + { + // Replication lag discrimination is on and the server passed. + change_rlag_state(backend, RLAG_BELOW_LIMIT, max_rlag); + } + } + else + { + // The server is otherwise usable except it's lagging too much. 
+ change_rlag_state(backend, RLAG_ABOVE_LIMIT, max_rlag); + } } }