MXS-2223 Log a message when a slave is discriminated due to replication lag

Both the replication lag and the message printing state are saved in SERVER, although the values are mostly used by readwritesplit. A log message is printed both when a server goes over the limit and when it comes back below. Because of concurrency issues, a message may be printed multiple times before different threads detect the new message state. Documentation updated to explain the change.
2019-01-16 14:58:08 +02:00
parent 0dfdff1f95
commit 7f978f275f
4 changed files with 72 additions and 16 deletions
--- a/Documentation/Routers/ReadWriteSplit.md
+++ b/Documentation/Routers/ReadWriteSplit.md
@ -109,12 +109,14 @@ This feature is disabled by default.

 	max_slave_replication_lag=<allowed lag in seconds>

-This applies to Master/Slave replication with MySQL monitor and
-`detect_replication_lag=1` options set. max_slave_replication_lag must be
-greater than the monitor interval.
-
-This option only affects Master-Slave clusters. Galera clusters do not have a
-concept of slave lag even if the application of write sets might have lag.
+The readwritesplit router does not detect replication lag itself. A monitor
+such as the MariaDB Monitor for a Master-Slave cluster is required. This option
+only affects Master-Slave clusters. Galera clusters do not have a concept of
+slave lag even if the application of write sets might have lag. When a server is
+disqualified from routing because of replication lag, a warning is logged. Similarly,
+when the server has caught up enough to be a valid routing target, another warning
+is logged. These messages are only logged when a query is being routed and the
+replication lag state changes.

 ### `use_sql_variables_in`

--- a/include/maxscale/server.h
+++ b/include/maxscale/server.h
@ -94,6 +94,13 @@ typedef enum
    SERVER_TYPE_MYSQL
 } server_type_t;

+typedef enum
+{
+    RLAG_NONE,          /**< Initial value given by server_alloc(); no lag check result recorded yet */
+    RLAG_BELOW_LIMIT,   /**< Last recorded lag check passed: lag was within the allowed limit */
+    RLAG_ABOVE_LIMIT    /**< Last recorded lag check failed: lag was over the allowed limit */
+} RLAG_STATE;           /**< Replication lag state stored in SERVER::rlag_state */
+
 static inline void server_decode_version(uint64_t version, SERVER_VERSION* server_version)
 {
    uint32_t major = version / 10000;
@ -159,11 +166,13 @@ typedef struct server
                                                             * */
    unsigned long node_ts;                                  /**< Last timestamp set from M/S monitor module */
    long          master_id;                                /**< Master server id of this node */
+
    // Misc fields
-    bool master_err_is_logged;                  /**< If node failed, this indicates whether it is logged. Only
-                                                 * used
-                                                 *   by rwsplit. TODO: Move to rwsplit */
-    bool                   warn_ssl_not_enabled;/**< SSL not used for an SSL enabled server */
+    bool master_err_is_logged;    /**< If node failed, this indicates whether it is logged. Only used
+                                   *   by rwsplit. TODO: Move to rwsplit */
+    bool warn_ssl_not_enabled;    /**< SSL not used for an SSL enabled server */
+    RLAG_STATE rlag_state;        /**< Is replication lag above or under limit? Used by rwsplit. */
+
    MxsDiskSpaceThreshold* disk_space_threshold;/**< Disk space thresholds */
 } SERVER;

--- a/server/core/server.cc
+++ b/server/core/server.cc
@ -183,6 +183,7 @@ SERVER* server_alloc(const char* name, MXS_CONFIG_PARAMETER* params)
    server->master_id = -1;
    server->master_err_is_logged = false;
    server->warn_ssl_not_enabled = true;
+    server->rlag_state = RLAG_NONE;
    server->disk_space_threshold = NULL;

    if (*monuser && *monpw)
--- a/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc
+++ b/server/modules/routing/readwritesplit/rwsplit_route_stmt.cc
@ -19,6 +19,7 @@
 #include <string.h>
 #include <strings.h>

+#include <maxbase/atomic.hh>
 #include <maxscale/alloc.h>
 #include <maxscale/clock.h>
 #include <maxscale/modutil.h>
@ -591,6 +592,38 @@ SRWBackend RWSplitSession::get_hinted_backend(char* name)
    return rval;
 }

+/**
+ * Record the replication lag state of a backend's server and warn when it flips.
+ *
+ * The state is stored in the SERVER object. A warning is written when the server
+ * goes above the limit, and another when it returns below the limit after having
+ * been above it. The very first transition from RLAG_NONE to RLAG_BELOW_LIMIT is
+ * stored silently.
+ *
+ * @param backend   Backend whose server is updated
+ * @param new_state State to record, either RLAG_BELOW_LIMIT or RLAG_ABOVE_LIMIT
+ * @param max_rlag  Configured maximum lag. Only used in the log messages.
+ */
+static void change_rlag_state(SRWBackend& backend, RLAG_STATE new_state, int max_rlag)
+{
+    mxb_assert(new_state == RLAG_BELOW_LIMIT || new_state == RLAG_ABOVE_LIMIT);
+    namespace atom = maxbase::atomic;
+
+    auto server = backend->server();
+    const auto previous_state = atom::load(&server->rlag_state, atom::RELAXED);
+    if (previous_state == new_state)
+    {
+        // Same state as before: nothing to store and nothing to report.
+        return;
+    }
+
+    // The load above and this store are two separate relaxed operations, so several
+    // callers may each see the old state and each log the same transition.
+    atom::store(&server->rlag_state, new_state, atom::RELAXED);
+
+    if (new_state == RLAG_ABOVE_LIMIT)
+    {
+        MXS_WARNING("Replication lag of '%s' is %is, which is above the configured limit %is. "
+                    "'%s' is excluded from query routing.",
+                    server->name, server->rlag, max_rlag, server->name);
+    }
+    else if (previous_state == RLAG_ABOVE_LIMIT)
+    {
+        // Only report a catch-up if the server was actually excluded earlier.
+        MXS_WARNING("Replication lag of '%s' is %is, which is below the allowed limit %is. "
+                    "'%s' is returned to query routing.",
+                    server->name, server->rlag, max_rlag, server->name);
+    }
+}
+
 SRWBackend RWSplitSession::get_slave_backend(int max_rlag)
 {
    // create a list of useable backends (includes masters, function name is a bit off),
@ -607,14 +640,25 @@ SRWBackend RWSplitSession::get_slave_backend(int max_rlag)
            && counts.second < m_router->max_slave_count();

        bool master_or_slave = backend->is_master() || backend->is_slave();
-        bool is_useable = backend->in_use() || can_take_slave_into_use;
-        bool not_a_slacker = rpl_lag_is_ok(backend, max_rlag);
+        bool is_usable = backend->in_use() || can_take_slave_into_use;
+        bool rlag_ok = rpl_lag_is_ok(backend, max_rlag);

-        bool server_is_candidate = master_or_slave && is_useable && not_a_slacker;
-
-        if (server_is_candidate)
+        if (master_or_slave && is_usable)
        {
-            candidates.push_back(&backend);
+            if (rlag_ok)
+            {
+                candidates.push_back(&backend);
+                if (max_rlag > 0)
+                {
+                    // Replication lag discrimination is on and the server passed.
+                    change_rlag_state(backend, RLAG_BELOW_LIMIT, max_rlag);
+                }
+            }
+            else
+            {
+                // The server is otherwise usable except it's lagging too much.
+                change_rlag_state(backend, RLAG_ABOVE_LIMIT, max_rlag);
+            }
        }
    }