MXS-2223 Log a message when a slave is discriminated due to replication lag

Both the replication lag and the message printing state are saved in SERVER, although the values are mostly used by readwritesplit. A log message is printed both when a server goes over the limit and when it comes back below it. Because the state is read and updated without locking, several threads may each detect the same state change and print the message more than once before the new state is visible to all of them. Documentation updated to explain the change.
2019-01-16 14:58:08 +02:00
parent 0dfdff1f95
commit 7f978f275f
4 changed files with 72 additions and 16 deletions
--- a/Documentation/Routers/ReadWriteSplit.md
+++ b/Documentation/Routers/ReadWriteSplit.md
@ -109,12 +109,14 @@ This feature is disabled by default.
 	max_slave_replication_lag=<allowed lag in seconds>
-This applies to Master/Slave replication with MySQL monitor and
+The Readwritesplit-router does not detect the replication lag itself. A monitor
-`detect_replication_lag=1` options set. max_slave_replication_lag must be
+such as the MariaDB-monitor for a Master/Slave-cluster is required. This option
-greater than the monitor interval.
+only affects Master-Slave clusters. Galera clusters do not have a concept of
-
+slave lag even if the application of write sets might have lag. When a server is
-This option only affects Master-Slave clusters. Galera clusters do not have a
+disqualified from routing because of replication lag, a warning is logged. Similarly,
-concept of slave lag even if the application of write sets might have lag.
+when the server has caught up enough to be a valid routing target, another warning
 is logged. These messages are only logged when a query is being routed and the
 replication state changes.
 ### `use_sql_variables_in`
--- a/include/maxscale/server.h
+++ b/include/maxscale/server.h
@ -94,6 +94,13 @@ typedef enum
    SERVER_TYPE_MYSQL
 } server_type_t;
 /**
 * Replication lag state of a server relative to a configured lag limit.
 * Stored in SERVER::rlag_state.
 */
 typedef enum
 {
    RLAG_NONE,          /**< Value assigned to a server when it is allocated */
    RLAG_BELOW_LIMIT,   /**< Server passed the replication lag check */
    RLAG_ABOVE_LIMIT    /**< Server failed the replication lag check */
 } RLAG_STATE;
 static inline void server_decode_version(uint64_t version, SERVER_VERSION* server_version)
 {
    uint32_t major = version / 10000;
@ -159,11 +166,13 @@ typedef struct server
                                                             * */
    unsigned long node_ts;                                  /**< Last timestamp set from M/S monitor module */
    long          master_id;                                /**< Master server id of this node */
    // Misc fields
-    bool master_err_is_logged;                  /**< If node failed, this indicates whether it is logged. Only
+    bool master_err_is_logged;    /**< If node failed, this indicates whether it is logged. Only
                                                 * used
                                   *   by rwsplit. TODO: Move to rwsplit */
    bool warn_ssl_not_enabled;    /**< SSL not used for an SSL enabled server */
    RLAG_STATE rlag_state;        /**< Is replication lag above or under limit? Used by rwsplit. */
    MxsDiskSpaceThreshold* disk_space_threshold;/**< Disk space thresholds */
 } SERVER;
--- a/server/core/server.cc
+++ b/server/core/server.cc
@ -183,6 +183,7 @@ SERVER* server_alloc(const char* name, MXS_CONFIG_PARAMETER* params)
    server->master_id = -1;
    server->master_err_is_logged = false;
    server->warn_ssl_not_enabled = true;
    server->rlag_state = RLAG_NONE;
    server->disk_space_threshold = NULL;
    if (*monuser && *monpw)
--- a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc
+++ b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc
@ -19,6 +19,7 @@
 #include <string.h>
 #include <strings.h>
 #include <maxbase/atomic.hh>
 #include <maxscale/alloc.h>
 #include <maxscale/clock.h>
 #include <maxscale/modutil.h>
@ -591,6 +592,38 @@ SRWBackend RWSplitSession::get_hinted_backend(char* name)
    return rval;
 }
 /**
 * Record a new replication lag state for a server and warn if the state changed.
 *
 * A warning is written when the server moves above the limit, and when it drops
 * back from above the limit. A transition from RLAG_NONE to RLAG_BELOW_LIMIT is
 * stored silently. The state is read and written with relaxed atomics but the
 * check-then-store sequence is not atomic as a whole, so concurrent callers may
 * each log the same transition.
 *
 * @param backend   Backend whose server is updated
 * @param new_state State to store, either RLAG_BELOW_LIMIT or RLAG_ABOVE_LIMIT
 * @param max_rlag  Configured maximum lag, only used in the log message
 */
 static void change_rlag_state(SRWBackend& backend, RLAG_STATE new_state, int max_rlag)
 {
    mxb_assert(new_state == RLAG_BELOW_LIMIT || new_state == RLAG_ABOVE_LIMIT);

    auto target = backend->server();
    const auto previous = maxbase::atomic::load(&target->rlag_state, maxbase::atomic::RELAXED);
    if (previous == new_state)
    {
        // No transition, so there is nothing to store or report.
        return;
    }

    maxbase::atomic::store(&target->rlag_state, new_state, maxbase::atomic::RELAXED);

    if (new_state == RLAG_ABOVE_LIMIT)
    {
        // The server just crossed over the limit.
        MXS_WARNING("Replication lag of '%s' is %is, which is above the configured limit %is. "
                    "'%s' is excluded from query routing.",
                    target->name, target->rlag, max_rlag, target->name);
    }
    else if (previous == RLAG_ABOVE_LIMIT)
    {
        // The server caught up. Nothing is logged if the previous state was RLAG_NONE.
        MXS_WARNING("Replication lag of '%s' is %is, which is below the allowed limit %is. "
                    "'%s' is returned to query routing.",
                    target->name, target->rlag, max_rlag, target->name);
    }
 }
 SRWBackend RWSplitSession::get_slave_backend(int max_rlag)
 {
    // create a list of useable backends (includes masters, function name is a bit off),
@ -607,14 +640,25 @@ SRWBackend RWSplitSession::get_slave_backend(int max_rlag)
            && counts.second < m_router->max_slave_count();
        bool master_or_slave = backend->is_master() || backend->is_slave();
-        bool is_useable = backend->in_use() || can_take_slave_into_use;
+        bool is_usable = backend->in_use() || can_take_slave_into_use;
-        bool not_a_slacker = rpl_lag_is_ok(backend, max_rlag);
+        bool rlag_ok = rpl_lag_is_ok(backend, max_rlag);
-        bool server_is_candidate = master_or_slave && is_useable && not_a_slacker;
+        if (master_or_slave && is_usable)
-
+        {
-        if (server_is_candidate)
+            if (rlag_ok)
            {
                candidates.push_back(&backend);
                if (max_rlag > 0)
                {
                    // Replication lag discrimination is on and the server passed.
                    change_rlag_state(backend, RLAG_BELOW_LIMIT, max_rlag);
                }
            }
            else
            {
                // The server is otherwise usable except it's lagging too much.
                change_rlag_state(backend, RLAG_ABOVE_LIMIT, max_rlag);
            }
        }
    }