From b582119d27a64e4b8a7d4588488efe1ac521461e Mon Sep 17 00:00:00 2001
From: Johan Wikman <johan.wikman@mariadb.com>
Date: Fri, 1 Feb 2019 09:51:37 +0200
Subject: [PATCH] MXS-2275 Check for softfailed nodes

When checking the node info, also include information about whether
a node is being SOFTFAILed. If it is, turn on the `Being Drained`
bit.

A node is SOFTFAILed with the intention of removing it, so better
not to create new connections to it as they later would be broken
when the node is actually taken down.
---
 Documentation/Monitors/Clustrix-Monitor.md    | 18 +++++++++++
 .../monitor/clustrixmon/clustrixmonitor.cc    | 31 +++++++++++++++++--
 2 files changed, 47 insertions(+), 2 deletions(-)
diff --git a/Documentation/Monitors/Clustrix-Monitor.md b/Documentation/Monitors/Clustrix-Monitor.md
index 91e068da6..db0708588 100644
--- a/Documentation/Monitors/Clustrix-Monitor.md
+++ b/Documentation/Monitors/Clustrix-Monitor.md
@@ -54,11 +54,13 @@ Note that the monitor user _must_ have `SELECT` grant on the following tables:
 
    * `system.nodeinfo`
    * `system.membership`
+   * `system.softfailed_nodes`
 
 You can give the necessary grants using the following commands:
 ```
     GRANT SELECT ON system.membership TO 'myuser'@'%';
     GRANT SELECT ON system.nodeinfo TO 'myuser'@'%';
+    GRANT SELECT ON system.softfailed_nodes TO 'myuser'@'%';
 ```
 Further, if you want be able to _softfail_ and _unsoftfail_a node via MaxScale,
 then the monitor user must have `SUPER` privileges, which can be granted like:
@@ -136,3 +138,19 @@ $ maxctrl call command clustrixmon unsoftfail TheClustrixMonitor @@TheClustrixMo
 ```
 If a node is successfully softfailed, then a `Being Drained` status of
 the corresponding MaxScale server object will be cleared.
+
+## SOFTFAILed nodes
+
+During the cluster check, which is performed once per
+`cluster_monitor_interval`, the monitor will also check whether any
+nodes are being softfailed. The status of the corresponding server
+object of a node being softfailed will be set to `Being Drained`,
+which will prevent new connections from being created to that node.
+
+If a node that was softfailed is UNSOFTFAILed then the `Being Drained`
+status will be cleared.
+
+If the softfailing and unsoftfailing is initiated using the `softfail`
+and `unsoftfail` commands of the Clustrix monitor, then there will be
+no delay between the softfailing or unsoftfailing being initiated and the
+`Being Drained` status being turned on/off.
diff --git a/server/modules/monitor/clustrixmon/clustrixmonitor.cc b/server/modules/monitor/clustrixmon/clustrixmonitor.cc
index 329833e4b..771b9ad55 100644
--- a/server/modules/monitor/clustrixmon/clustrixmonitor.cc
+++ b/server/modules/monitor/clustrixmon/clustrixmonitor.cc
@@ -230,7 +230,9 @@ void ClustrixMonitor::refresh_nodes()
 
     if (check_cluster_membership(&memberships))
     {
-        const char ZQUERY[] = "SELECT nodeid, iface_ip, mysql_port, healthmon_port FROM system.nodeinfo";
+        const char ZQUERY[] =
+            "SELECT ni.nodeid, ni.iface_ip, ni.mysql_port, ni.healthmon_port, sn.nodeid FROM system.nodeinfo AS ni "
+            "LEFT JOIN system.softfailed_nodes AS sn ON ni.nodeid = sn.nodeid";
 
         if (mysql_query(m_pHub_con, ZQUERY) == 0)
         {
@@ -238,7 +240,7 @@ void ClustrixMonitor::refresh_nodes()
 
             if (pResult)
             {
-                mxb_assert(mysql_field_count(m_pHub_con) == 4);
+                mxb_assert(mysql_field_count(m_pHub_con) == 5);
 
                 set<int> nids;
                 for (const auto& element : m_nodes)
@@ -256,6 +258,7 @@ void ClustrixMonitor::refresh_nodes()
                         string ip = row[1];
                         int mysql_port = row[2] ? atoi(row[2]) : DEFAULT_MYSQL_PORT;
                         int health_port = row[3] ? atoi(row[3]) : DEFAULT_HEALTH_PORT;
+                        bool softfailed = row[4] ? true : false;
 
                         // '@@' ensures no clash with user created servers.
                         // Monitor name ensures no clash with other Clustrix monitor instances.
@@ -286,6 +289,23 @@ void ClustrixMonitor::refresh_nodes()
                                 node.set_health_port(health_port);
                             }
 
+                            bool is_being_drained = node.server()->is_being_drained();
+
+                            if (softfailed && !is_being_drained)
+                            {
+                                MXS_NOTICE("%s: Node %d (%s) has been SOFTFAILed. Turning ON 'Being Drained'.",
+                                           m_name, node.id(), node.server()->address);
+
+                                node.server()->set_status(SERVER_BEING_DRAINED);
+                            }
+                            else if (!softfailed && is_being_drained)
+                            {
+                                MXS_NOTICE("%s: Node %d (%s) is no longer being SOFTFAILed. Turning OFF 'Being Drained'.",
+                                           m_name, node.id(), node.server()->address);
+
+                                node.server()->clear_status(SERVER_BEING_DRAINED);
+                            }
+
                             nids.erase(id);
                         }
                         else if (mit != memberships.end())
@@ -303,6 +323,11 @@ void ClustrixMonitor::refresh_nodes()
                                 SERVER* pServer = SERVER::find_by_unique_name(name);
                                 mxb_assert(pServer);
 
+                                if (softfailed)
+                                {
+                                    pServer->set_status(SERVER_BEING_DRAINED);
+                                }
+
                                 const ClustrixMembership& membership = mit->second;
                                 int health_check_threshold = m_config.health_check_threshold();
 
@@ -336,6 +361,8 @@ void ClustrixMonitor::refresh_nodes()
 
                 mysql_free_result(pResult);
 
+                // Any nodes that were not found are not available, so their
+                // state must be set accordingly.
                 for (const auto nid : nids)
                 {
                     auto it = m_nodes.find(nid);