MXS-2012 Write replication lag to SERVER

Allows routers to read the value.
2018-08-20 15:25:01 +03:00
parent 44a57dbefd
commit 03cefcc4ac
7 changed files with 39 additions and 19 deletions
--- a/server/modules/monitor/mariadbmon/cluster_discovery.cc
+++ b/server/modules/monitor/mariadbmon/cluster_discovery.cc
@ -406,6 +406,7 @@ MariaDBServer* MariaDBMonitor::find_master_inside_cycle(ServerArray& cycle_membe

 /**
 * Assign replication role status bits to the servers in the cluster. Starts from the cluster master server.
+ * Also updates replication lag.
 */
 void MariaDBMonitor::assign_server_roles()
 {
@ -416,6 +417,7 @@ void MariaDBMonitor::assign_server_roles()
    for (auto server : m_servers)
    {
        server->clear_status(remove_bits);
+        server->m_replication_lag = MXS_RLAG_UNDEFINED;
    }

    // Check the master node, label it as the [Master] if
@ -425,6 +427,8 @@ void MariaDBMonitor::assign_server_roles()
    {
        if (m_master->is_running())
        {
+            // Master gets replication lag 0 even if it's replicating from an external server.
+            m_master->m_replication_lag = 0;
            if (m_master->is_read_only())
            {
                // Special case: read_only is ON on a running master but there is no alternative master.
@ -464,7 +468,7 @@ void MariaDBMonitor::assign_server_roles()

 /**
 * Check if the servers replicating from the given node qualify for [Slave] and mark them. Continue the
- * search to any found slaves.
+ * search to any found slaves. Also updates replication lag.
 *
 * @param start_node The root master node where the search begins. The node itself is not marked [Slave].
 */
@ -557,6 +561,16 @@ void MariaDBMonitor::assign_slave_and_relay_master(MariaDBServer* start_node)
                    if (slave->is_running())
                    {
                        slave->set_status(SERVER_SLAVE);
+                        // Write the replication lag for this slave. It may have multiple slave connections,
+                        // in which case take the smallest value. This only counts the slave connections
+                        // leading to the master or a relay.
+                        int curr_rlag = slave->m_replication_lag;
+                        int new_rlag = sstatus->seconds_behind_master;
+                        if (new_rlag != MXS_RLAG_UNDEFINED &&
+                            (curr_rlag == MXS_RLAG_UNDEFINED || new_rlag < curr_rlag))
+                        {
+                            slave->m_replication_lag = new_rlag;
+                        }
                    }
                }
            }
--- a/server/modules/monitor/mariadbmon/mariadbmon.cc
+++ b/server/modules/monitor/mariadbmon/mariadbmon.cc
@ -468,9 +468,11 @@ void MariaDBMonitor::tick()

    // Update shared status. The next functions read the shared status. TODO: change the following
    // functions to read "pending_status" instead.
-    for (auto mon_srv = m_monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
+    for (auto server : m_servers)
    {
-        mon_srv->server->status = mon_srv->pending_status;
+        SERVER* srv = server->m_server_base->server;
+        srv->rlag = server->m_replication_lag;
+        srv->status = server->m_server_base->pending_status;
    }

    log_master_changes();
--- a/server/modules/monitor/mariadbmon/mariadbserver.cc
+++ b/server/modules/monitor/mariadbmon/mariadbserver.cc
@ -46,6 +46,7 @@ MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_
    , m_latest_event(time(NULL))
    , m_gtid_domain_id(GTID_DOMAIN_UNKNOWN)
    , m_topology_changed(true)
+    , m_replication_lag(MXS_RLAG_UNDEFINED)
    , m_print_update_errormsg(true)
 {
    ss_dassert(monitored_server);
@ -188,7 +189,11 @@ bool MariaDBServer::do_show_slave_status(string* errmsg_out)
            SlaveStatus::slave_io_from_string(result->get_string(i_slave_io_running));
        sstatus_row.slave_sql_running = (result->get_string(i_slave_sql_running) == "Yes");
        sstatus_row.master_server_id = result->get_uint(i_master_server_id);
-        sstatus_row.seconds_behind_master = result->get_uint(i_seconds_behind_master);
+
+        auto rlag = result->get_uint(i_seconds_behind_master);
+        // If slave connection is stopped, the value given by the backend is null -> -1.
+        sstatus_row.seconds_behind_master = (rlag < 0) ? MXS_RLAG_UNDEFINED :
+            (rlag > INT_MAX) ? INT_MAX : rlag;

        if (sstatus_row.slave_io_running == SlaveStatus::SLAVE_IO_YES && sstatus_row.slave_sql_running)
        {
@ -1175,7 +1180,7 @@ string SlaveStatus::to_string() const
                                          slave_sql_running ? "Yes" : "No");

    string rval = string_printf(
-            "  Host: %22s, IO/SQL running: %7s, Master ID: %4" PRId64 ", Gtid_IO_Pos: %s, R.Lag: %" PRId64,
+            "  Host: %22s, IO/SQL running: %7s, Master ID: %4" PRId64 ", Gtid_IO_Pos: %s, R.Lag: %d",
            host_port.c_str(), running_states.c_str(), master_server_id,
            gtid_io_pos.to_string().c_str(), seconds_behind_master);
    return rval;
--- a/server/modules/monitor/mariadbmon/mariadbserver.hh
+++ b/server/modules/monitor/mariadbmon/mariadbserver.hh
@ -53,7 +53,8 @@ public:
    bool slave_sql_running = false;                     /* Slave SQL thread running state, true if "Yes" */
    GtidList gtid_io_pos;                               /* Gtid I/O position of the slave thread. */
    std::string last_error;                             /* Last IO or SQL error encountered. */
-    int64_t seconds_behind_master = 0;                  /* How much behind the slave is. */
+    int seconds_behind_master = MXS_RLAG_UNDEFINED;     /* How much behind the slave is. */
+
    std::string to_string() const;
    static slave_io_running_t slave_io_from_string(const std::string& str);
    static std::string slave_io_to_string(slave_io_running_t slave_io);
@ -151,6 +152,8 @@ public:
    bool            m_topology_changed;     /**< Has anything that could affect replication topology changed
                                              *  this iteration? Causes: server id, slave connections,
                                              *  read-only. */
+    int             m_replication_lag;      /**< Replication lag of the server. Used during calculation so
+                                              *  that the actual SERVER struct is only written to once. */
    NodeData        m_node;                 /**< Replication topology data */
    SlaveStatusArray m_slave_status;        /**< Data returned from SHOW SLAVE STATUS */
    ReplicationSettings m_rpl_settings;     /**< Miscellaneous replication related settings. These are not