diff --git a/Documentation/Monitors/Clustrix-Monitor.md b/Documentation/Monitors/Clustrix-Monitor.md
index 9d67a1081..91e068da6 100644
--- a/Documentation/Monitors/Clustrix-Monitor.md
+++ b/Documentation/Monitors/Clustrix-Monitor.md
@@ -28,6 +28,26 @@ password=mypwd
 ```
 
+## Dynamic Servers
+
+The server objects the Clustrix monitor creates for each detected
+Clustrix node will be named like
+```
+@@<name-of-the-Clustrix-monitor>:node-<id>
+```
+where `<name-of-the-Clustrix-monitor>` is the name of the Clustrix monitor
+instance, as defined in the MaxScale configuration file, and `<id>` is the
+id of the Clustrix node.
+
+For instance, with the Clustrix monitor defined as above and a Clustrix
+cluster consisting of 3 nodes whose ids are `1`, `2` and `3` respectively,
+the names of the created server objects will be:
+```
+@@TheClustrixMonitor:node-1
+@@TheClustrixMonitor:node-2
+@@TheClustrixMonitor:node-3
+```
+
 ### Grants
 
 Note that the monitor user _must_ have `SELECT` grant on the following tables:
@@ -37,8 +57,13 @@ Note that the monitor user _must_ have `SELECT` grant on the following tables:
 
 You can give the necessary grants using the following commands:
 ```
-grant select on system.membership to 'myuser'@'%';
-grant select on system.nodeinfo to 'myuser'@'%';
+GRANT SELECT ON system.membership TO 'myuser'@'%';
+GRANT SELECT ON system.nodeinfo TO 'myuser'@'%';
+```
+Further, if you want to be able to _softfail_ and _unsoftfail_ a node via
+MaxScale, then the monitor user must have `SUPER` privileges, which can be
+granted like:
+```
+GRANT SUPER ON *.* TO 'myuser'@'%';
 ```
 
 The user name must be changed to the one actually being used.
@@ -69,3 +94,45 @@ considers a particular node to be down. The default value is 2.
 ```
 health_check_threshold=3
 ```
+
+## Commands
+
+The Clustrix monitor supports the following module commands.
+
+### `softfail`
+
+With the `softfail` module command, a node can be _softfailed_ via
+MaxScale. The command requires as arguments the name of the Clustrix
+monitor instance (as defined in the configuration file) and the name
+of the node to be softfailed.
+
+For instance, with a configuration file like
+```
+[TheClustrixMonitor]
+type=monitor
+module=clustrixmon
+...
+```
+the node whose server name is `@@TheClustrixMonitor:node-1` can
+be softfailed like
+```
+$ maxctrl call command clustrixmon softfail TheClustrixMonitor @@TheClustrixMonitor:node-1
+```
+If a node is successfully softfailed, then the status of the corresponding
+MaxScale server object will be set to `Being Drained`, which will prevent
+new connections from being created to the node.
+
+### `unsoftfail`
+
+With the `unsoftfail` module command, a node can be _unsoftfailed_ via
+MaxScale. The command requires as arguments the name of the Clustrix
+monitor instance (as defined in the configuration file) and the name
+of the node to be unsoftfailed.
+
+With a setup similar to the `softfail` case, a node can be unsoftfailed
+like:
+```
+$ maxctrl call command clustrixmon unsoftfail TheClustrixMonitor @@TheClustrixMonitor:node-1
+```
+If a node is successfully unsoftfailed, then the `Being Drained` status of
+the corresponding MaxScale server object will be cleared.
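As `ClustrixMonitor::perform_operation()` in the monitor source below shows, both module commands map directly onto Clustrix's own SQL interface: for the node with id `1`, `softfail` executes `ALTER CLUSTER SOFTFAIL 1` over the monitor's hub connection, and `unsoftfail` executes `ALTER CLUSTER UNSOFTFAIL 1`. The `Being Drained` status of the corresponding server object is toggled only if that statement succeeds.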
diff --git a/server/modules/monitor/clustrixmon/clustrixmonitor.cc b/server/modules/monitor/clustrixmon/clustrixmonitor.cc
index 870354d0d..d9d3558d8 100644
--- a/server/modules/monitor/clustrixmon/clustrixmonitor.cc
+++ b/server/modules/monitor/clustrixmon/clustrixmonitor.cc
@@ -14,11 +14,21 @@
 #include "clustrixmonitor.hh"
 #include <algorithm>
 #include <set>
+#include <maxscale/json_api.h>
 #include "../../../core/internal/config_runtime.hh"
 
 namespace http = mxb::http;
 using namespace std;
 
+#define LOG_JSON_ERROR(ppJson, format, ...) \
+    do { \
+        MXS_ERROR(format, ##__VA_ARGS__); \
+        if (ppJson) \
+        { \
+            *ppJson = mxs_json_error_append(*ppJson, format, ##__VA_ARGS__); \
+        } \
+    } while (false)
+
 namespace
 {
@@ -57,13 +67,47 @@ bool ClustrixMonitor::configure(const MXS_CONFIG_PARAMETER* pParams)
 
 bool ClustrixMonitor::softfail(SERVER* pServer, json_t** ppError)
 {
-    MXS_NOTICE("Should softfail %s.", pServer->address);
+    bool rv = false;
+
+    if (is_running())
+    {
+        // call() executes the lambda on the monitor thread and waits for
+        // it to complete, so rv has been assigned when call() returns.
+        call([this, pServer, ppError, &rv]() {
+                rv = perform_softfail(pServer, ppError);
+            },
+            EXECUTE_QUEUED);
+    }
+    else
+    {
+        LOG_JSON_ERROR(ppError,
+                       "%s: The monitor is not running and hence "
+                       "SOFTFAIL cannot be performed for %s.",
+                       m_name, pServer->address);
+    }
 
-    return true;
+    return rv;
 }
 
 bool ClustrixMonitor::unsoftfail(SERVER* pServer, json_t** ppError)
 {
-    MXS_NOTICE("Should unsoftfail %s.", pServer->address);
+    bool rv = false;
+
+    if (is_running())
+    {
+        call([this, pServer, ppError, &rv]() {
+                rv = perform_unsoftfail(pServer, ppError);
+            },
+            EXECUTE_QUEUED);
+    }
+    else
+    {
+        LOG_JSON_ERROR(ppError,
+                       "%s: The monitor is not running and hence "
+                       "UNSOFTFAIL cannot be performed for %s.",
+                       m_name, pServer->address);
+    }
 
-    return true;
+    return rv;
 }
@@ -217,7 +259,7 @@ void ClustrixMonitor::refresh_nodes()
 
             // '@@' ensures no clash with user created servers.
             // Monitor name ensures no clash with other Clustrix monitor instances.
-            string name = string("@@") + m_name + ":server-" + std::to_string(id);
+            string name = string("@@") + m_name + ":node-" + std::to_string(id);
 
             auto nit = m_nodes.find(id);
             auto mit = memberships.find(id);
@@ -566,3 +608,95 @@ bool ClustrixMonitor::check_http(Call::action_t action)
 
     return false;
 }
+
+bool ClustrixMonitor::perform_softfail(SERVER* pServer, json_t** ppError)
+{
+    return perform_operation(Operation::SOFTFAIL, pServer, ppError);
+}
+
+bool ClustrixMonitor::perform_unsoftfail(SERVER* pServer, json_t** ppError)
+{
+    return perform_operation(Operation::UNSOFTFAIL, pServer, ppError);
+}
+
+bool ClustrixMonitor::perform_operation(Operation operation,
+                                        SERVER* pServer,
+                                        json_t** ppError)
+{
+    bool performed = false;
+
+    const char ZSOFTFAIL[] = "SOFTFAIL";
+    const char ZUNSOFTFAIL[] = "UNSOFTFAIL";
+
+    const char* zOperation = (operation == Operation::SOFTFAIL) ? ZSOFTFAIL : ZUNSOFTFAIL;
+
+    if (!m_pHub_con)
+    {
+        check_hub_and_refresh_nodes();
+    }
+
+    if (m_pHub_con)
+    {
+        auto it = find_if(m_nodes.begin(), m_nodes.end(),
+                          [pServer] (const auto& element) {
+                              return element.second.server() == pServer;
+                          });
+
+        if (it != m_nodes.end())
+        {
+            ClustrixNode& node = it->second;
+
+            const char ZQUERY_FORMAT[] = "ALTER CLUSTER %s %d";
+
+            int id = node.id();
+            char zQuery[sizeof(ZQUERY_FORMAT) + sizeof(ZUNSOFTFAIL) + UINTLEN(id)]; // ZUNSOFTFAIL is longer
+
+            sprintf(zQuery, ZQUERY_FORMAT, zOperation, id);
+
+            if (mysql_query(m_pHub_con, zQuery) == 0)
+            {
+                MXS_NOTICE("Clustrix monitor %s performed %s on node %d (%s).",
+                           m_name, zOperation, id, pServer->address);
+
+                if (operation == Operation::SOFTFAIL)
+                {
+                    MXS_NOTICE("%s: Turning on 'Being Drained' on server %s.",
+                               m_name, pServer->address);
+                    pServer->set_status(SERVER_BEING_DRAINED);
+                }
+                else
+                {
+                    mxb_assert(operation == Operation::UNSOFTFAIL);
+
+                    MXS_NOTICE("%s: Turning off 'Being Drained' on server %s.",
+                               m_name, pServer->address);
+                    pServer->clear_status(SERVER_BEING_DRAINED);
+                }
+
+                performed = true;
+            }
+            else
+            {
+                LOG_JSON_ERROR(ppError,
+                               "%s: The execution of '%s' failed: %s",
+                               m_name, zQuery, mysql_error(m_pHub_con));
+            }
+        }
+        else
+        {
+            LOG_JSON_ERROR(ppError,
+                           "%s: The server %s is not being monitored, "
+                           "cannot perform %s.",
+                           m_name, pServer->address, zOperation);
+        }
+    }
+    else
+    {
+        LOG_JSON_ERROR(ppError,
+                       "%s: Could not connect to any Clustrix node, "
+                       "cannot perform %s of %s.",
+                       m_name, zOperation, pServer->address);
+    }
+
+    return performed;
+}
diff --git a/server/modules/monitor/clustrixmon/clustrixmonitor.hh b/server/modules/monitor/clustrixmon/clustrixmonitor.hh
index 4b97918b3..1de58da55 100644
--- a/server/modules/monitor/clustrixmon/clustrixmonitor.hh
+++ b/server/modules/monitor/clustrixmon/clustrixmonitor.hh
@@ -87,6 +87,20 @@ private:
     void initiate_delayed_http_check();
     bool check_http(Call::action_t action);
 
+    bool perform_softfail(SERVER* pServer, json_t** ppError);
+    bool perform_unsoftfail(SERVER* pServer, json_t** ppError);
+
+    enum class Operation
+    {
+        SOFTFAIL,
+        UNSOFTFAIL,
+    };
+
+    bool perform_operation(Operation operation,
+                           SERVER* pServer,
+                           json_t** ppError);
+
     static long now()
     {
         return mxb::WorkerLoad::get_time_ms();
     }
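A note on the fixed-size query buffer in `perform_operation()`: `sizeof(ZQUERY_FORMAT)` counts the format string including its `%s`/`%d` placeholders and terminating NUL, `sizeof(ZUNSOFTFAIL)` covers the longer of the two operation names, and `UINTLEN(id)` (a clustrixmon macro not shown in this patch) bounds the characters needed for the node id, so the `sprintf` cannot overflow. The following standalone sketch reproduces that arithmetic; the `UINTLEN` used here is a hypothetical stand-in based on `std::numeric_limits`, not the project's macro.
```
#include <cstdio>
#include <limits>

int main()
{
    // Same constants as in ClustrixMonitor::perform_operation().
    const char ZQUERY_FORMAT[] = "ALTER CLUSTER %s %d";
    const char ZUNSOFTFAIL[] = "UNSOFTFAIL";    // the longer operation name

    // Hypothetical stand-in for the clustrixmon UINTLEN macro: an upper
    // bound on the characters needed to print any int (digits plus sign).
    const int UINTLEN = std::numeric_limits<int>::digits10 + 2;

    int id = 1;
    char zQuery[sizeof(ZQUERY_FORMAT) + sizeof(ZUNSOFTFAIL) + UINTLEN];

    // snprintf makes the no-overflow guarantee explicit; the buffer is
    // sized so that truncation cannot actually occur.
    std::snprintf(zQuery, sizeof(zQuery), ZQUERY_FORMAT, "SOFTFAIL", id);
    std::printf("%s\n", zQuery);    // prints: ALTER CLUSTER SOFTFAIL 1

    return 0;
}
```
With the operation name and node id substituted, this is exactly the statement the monitor sends over its hub connection.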