From 893059c53767a0340e5bf7f648522275fd95eaa3 Mon Sep 17 00:00:00 2001 From: Johan Wikman Date: Mon, 8 Apr 2019 15:35:24 +0300 Subject: [PATCH] MXS-2424 Use persisted nodes if bootstrap node missing At runtime the Clustrix monitor will save information about detected nodes to an sqlite3 database and delete that information if a node disappears. At startup, if the monitor fails to connect to a bootstrap node, it will try to connect to any of the persisted nodes and start from there. This means that in general it is sufficient if the Clustrix monitor at the very first startup can connect to a bootstrap node; thereafter it will get by even if the bootstrap node disappears for good. --- .../monitor/clustrixmon/clustrixmonitor.cc | 183 +++++++++++++----- .../monitor/clustrixmon/clustrixmonitor.hh | 16 +- .../monitor/clustrixmon/clustrixnode.hh | 74 +++++-- 3 files changed, 194 insertions(+), 79 deletions(-) diff --git a/server/modules/monitor/clustrixmon/clustrixmonitor.cc b/server/modules/monitor/clustrixmon/clustrixmonitor.cc index 503c6cde0..ecd8e7399 100644 --- a/server/modules/monitor/clustrixmon/clustrixmonitor.cc +++ b/server/modules/monitor/clustrixmon/clustrixmonitor.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include "../../../core/internal/config_runtime.hh" #include "../../../core/internal/service.hh" @@ -51,6 +52,28 @@ static const char SQL_UPSERT_FORMAT[] = static const char SQL_DELETE_FORMAT[] = "DELETE FROM clustrix_nodes WHERE id = %d"; + +static const char SQL_SELECT[] = + "SELECT ip, mysql_port FROM clustrix_nodes"; + +using HostPortPair = std::pair; +using HostPortPairs = std::vector; + +// sqlite3 callback. 
+int select_cb(void* pData, int nColumns, char** ppColumn, char** ppNames) +{ + std::vector* pNodes = static_cast*>(pData); + + mxb_assert(nColumns == 2); + + std::string host(ppColumn[0]); + int port = atoi(ppColumn[1]); + + pNodes->emplace_back(host, port); + + return 0; +} + } namespace @@ -88,7 +111,7 @@ sqlite3* open_or_create_db(const std::string& path) if (unlink(path.c_str()) != 0) { MXS_ERROR("Failed to delete database %s that could not be properly " - "initialized. You should delete the database manually.", path.c_str()); + "initialized. It should be deleted manually.", path.c_str()); sqlite3_close_v2(pDb); pDb = nullptr; } @@ -282,10 +305,14 @@ void ClustrixMonitor::choose_hub(Clustrix::Softfailed softfailed) // then we check the bootstrap servers, and if (!choose_bootstrap_hub(softfailed, ips)) { - // finally, if all else fails, we check servers that have been persisted. - // In practise we will only get here at startup (no dynamic servers) - // if the bootstrap servers cannot be contacted. - choose_persisted_hub(softfailed, ips); + // finally, if all else fails - in practise we will only get here at + // startup (no dynamic servers) if the bootstrap servers cannot be + // contacted - we try to refresh the nodes using persisted information + if (refresh_using_persisted_nodes(ips)) + { + // and then select a hub from the dynamic ones. + choose_dynamic_hub(softfailed, ips); + } } } @@ -345,32 +372,108 @@ bool ClustrixMonitor::choose_bootstrap_hub(Clustrix::Softfailed softfailed, std: return m_pHub_con != nullptr; } -bool ClustrixMonitor::choose_persisted_hub(Clustrix::Softfailed softfailed, std::set& ips_checked) +bool ClustrixMonitor::refresh_using_persisted_nodes(std::set& ips_checked) { - // TODO: Check persisted servers. 
- return false; + MXS_NOTICE("Attempting to find a Clustrix bootstrap node from one of the nodes " + "used during the previous run of MaxScale."); + + bool refreshed = false; + + HostPortPairs nodes; + char* pError = nullptr; + int rv = sqlite3_exec(m_pDb, SQL_SELECT, select_cb, &nodes, &pError); + + if (rv == SQLITE_OK) + { + const std::string& username = m_settings.conn_settings.username; + const std::string& password = m_settings.conn_settings.password; + char* zPassword = decrypt_password(password.c_str()); + + auto it = nodes.begin(); + + while (!refreshed && (it != nodes.end())) + { + const auto& node = *it; + + const std::string& host = node.first; + + if (ips_checked.find(host) == ips_checked.end()) + { + ips_checked.insert(host); + int port = node.second; + + MXS_NOTICE("Trying to find out cluster nodes from %s:%d.", host.c_str(), port); + + MYSQL* pHub_con = mysql_init(NULL); + + if (mysql_real_connect(pHub_con, host.c_str(), + username.c_str(), zPassword, + nullptr, + port, nullptr, 0)) + { + if (Clustrix::is_part_of_the_quorum(name(), pHub_con)) + { + if (refresh_nodes(pHub_con)) + { + MXS_NOTICE("Cluster nodes refreshed."); + refreshed = true; + } + } + else + { + MXS_WARNING("%s:%d is not part of the quorum, ignoring.", host.c_str(), port); + } + } + else + { + MXS_WARNING("Could not connect to %s:%d.", host.c_str(), port); + } + + mysql_close(pHub_con); + } + + ++it; + } + + MXS_FREE(zPassword); + } + else + { + MXS_ERROR("Could not look up persisted nodes: %s", pError ? 
pError : "Unknown error"); + } + + return refreshed; } -void ClustrixMonitor::refresh_nodes() +bool ClustrixMonitor::refresh_nodes() { mxb_assert(m_pHub_con); + return refresh_nodes(m_pHub_con); +} + +bool ClustrixMonitor::refresh_nodes(MYSQL* pHub_con) +{ + mxb_assert(pHub_con); + map memberships; - if (check_cluster_membership(&memberships)) + bool refreshed = check_cluster_membership(pHub_con, &memberships); + + if (refreshed) { const char ZQUERY[] = "SELECT ni.nodeid, ni.iface_ip, ni.mysql_port, ni.healthmon_port, sn.nodeid " "FROM system.nodeinfo AS ni " "LEFT JOIN system.softfailed_nodes AS sn ON ni.nodeid = sn.nodeid"; - if (mysql_query(m_pHub_con, ZQUERY) == 0) + if (mysql_query(pHub_con, ZQUERY) == 0) { - MYSQL_RES* pResult = mysql_store_result(m_pHub_con); + MYSQL_RES* pResult = mysql_store_result(pHub_con); if (pResult) { - mxb_assert(mysql_field_count(m_pHub_con) == 5); + mxb_assert(mysql_field_count(pHub_con) == 5); set nids; for (const auto& element : m_nodes) @@ -404,25 +507,7 @@ void ClustrixMonitor::refresh_nodes() ClustrixNode& node = nit->second; - bool changed = false; - - if (node.ip() != ip) - { - node.set_ip(ip); - changed = true; - } - - if (node.mysql_port() != mysql_port) - { - node.set_mysql_port(mysql_port); - changed = true; - } - - if (node.health_port() != health_port) - { - node.set_health_port(health_port); - changed = true; - } + node.update(ip, mysql_port, health_port); bool is_draining = node.server()->is_draining(); @@ -443,11 +528,6 @@ void ClustrixMonitor::refresh_nodes() node.server()->clear_status(SERVER_DRAINING); } - if (changed) - { - persist_node(node); - } - nids.erase(id); } else if (mit != memberships.end()) @@ -473,11 +553,10 @@ void ClustrixMonitor::refresh_nodes() const ClustrixMembership& membership = mit->second; int health_check_threshold = m_config.health_check_threshold(); - ClustrixNode node(membership, ip, mysql_port, health_port, + ClustrixNode node(this, membership, ip, mysql_port, health_port, 
health_check_threshold, pServer); m_nodes.insert(make_pair(id, node)); - persist_node(node); // New server, so it needs to be added to all services that // use this monitor for defining its cluster of servers. @@ -517,7 +596,6 @@ void ClustrixMonitor::refresh_nodes() ClustrixNode& node = it->second; node.set_running(false, ClustrixNode::APPROACH_OVERRIDE); - unpersist_node(node); } vector health_urls; @@ -536,15 +614,17 @@ void ClustrixMonitor::refresh_nodes() else { MXS_WARNING("%s: No result returned for '%s' on %s.", - name(), ZQUERY, m_pHub_server->address); + name(), ZQUERY, mysql_get_host_info(pHub_con)); } } else { MXS_ERROR("%s: Could not execute '%s' on %s: %s", - name(), ZQUERY, m_pHub_server->address, mysql_error(m_pHub_con)); + name(), ZQUERY, mysql_get_host_info(pHub_con), mysql_error(pHub_con)); } } + + return refreshed; } void ClustrixMonitor::check_cluster(Clustrix::Softfailed softfailed) @@ -578,24 +658,23 @@ void ClustrixMonitor::check_hub(Clustrix::Softfailed softfailed) } } -bool ClustrixMonitor::check_cluster_membership(std::map* pMemberships) +bool ClustrixMonitor::check_cluster_membership(MYSQL* pHub_con, + std::map* pMemberships) { + mxb_assert(pHub_con); mxb_assert(pMemberships); - mxb_assert(m_pHub_con); - mxb_assert(m_pHub_server); - bool rv = false; const char ZQUERY[] = "SELECT nid, status, instance, substate FROM system.membership"; - if (mysql_query(m_pHub_con, ZQUERY) == 0) + if (mysql_query(pHub_con, ZQUERY) == 0) { - MYSQL_RES* pResult = mysql_store_result(m_pHub_con); + MYSQL_RES* pResult = mysql_store_result(pHub_con); if (pResult) { - mxb_assert(mysql_field_count(m_pHub_con) == 4); + mxb_assert(mysql_field_count(pHub_con) == 4); set nids; for (const auto& element : m_nodes) @@ -666,7 +745,7 @@ bool ClustrixMonitor::check_cluster_membership(std::map else { MXS_ERROR("%s: Could not execute '%s' on %s: %s", - name(), ZQUERY, m_pHub_server->address, mysql_error(m_pHub_con)); + name(), ZQUERY, mysql_get_host_info(pHub_con), 
mysql_error(pHub_con)); } return rv; @@ -894,7 +973,7 @@ bool ClustrixMonitor::perform_operation(Operation operation, return performed; } -void ClustrixMonitor::persist_node(const ClustrixNode& node) +void ClustrixMonitor::persist(const ClustrixNode& node) { if (!m_pDb) { @@ -923,7 +1002,7 @@ void ClustrixMonitor::persist_node(const ClustrixNode& node) } } -void ClustrixMonitor::unpersist_node(const ClustrixNode& node) +void ClustrixMonitor::unpersist(const ClustrixNode& node) { if (!m_pDb) { diff --git a/server/modules/monitor/clustrixmon/clustrixmonitor.hh b/server/modules/monitor/clustrixmon/clustrixmonitor.hh index 40a239a42..a0b33eb2a 100644 --- a/server/modules/monitor/clustrixmon/clustrixmonitor.hh +++ b/server/modules/monitor/clustrixmon/clustrixmonitor.hh @@ -21,7 +21,8 @@ #include "clustrixmembership.hh" #include "clustrixnode.hh" -class ClustrixMonitor : public maxscale::MonitorWorker +class ClustrixMonitor : public maxscale::MonitorWorker, + private ClustrixNode::Persister { ClustrixMonitor(const ClustrixMonitor&) = delete; ClustrixMonitor& operator=(const ClustrixMonitor&) = delete; @@ -91,10 +92,12 @@ private: bool choose_dynamic_hub(Clustrix::Softfailed softfailed, std::set& ips_checked); bool choose_bootstrap_hub(Clustrix::Softfailed softfailed, std::set& ips_checked); - bool choose_persisted_hub(Clustrix::Softfailed softfailed, std::set& ips_checked); + bool refresh_using_persisted_nodes(std::set& ips_checked); - void refresh_nodes(); - bool check_cluster_membership(std::map* pMemberships); + bool refresh_nodes(); + bool refresh_nodes(MYSQL* pHub_con); + bool check_cluster_membership(MYSQL* pHub_con, + std::map* pMemberships); void update_server_statuses(); @@ -136,8 +139,9 @@ private: return mxb::WorkerLoad::get_time_ms(); } - void persist_node(const ClustrixNode& node); - void unpersist_node(const ClustrixNode& node); + // ClustrixNode::Persister + void persist(const ClustrixNode& node); + void unpersist(const ClustrixNode& node); private: Config 
m_config; diff --git a/server/modules/monitor/clustrixmon/clustrixnode.hh b/server/modules/monitor/clustrixmon/clustrixnode.hh index 5f85caee0..086bbab47 100644 --- a/server/modules/monitor/clustrixmon/clustrixnode.hh +++ b/server/modules/monitor/clustrixmon/clustrixnode.hh @@ -22,6 +22,13 @@ class ClustrixNode { public: + class Persister + { + public: + virtual void persist(const ClustrixNode& node) = 0; + virtual void unpersist(const ClustrixNode& node) = 0; + }; + enum { DEFAULT_MYSQL_PORT = 3306, @@ -34,13 +41,15 @@ public: APPROACH_DEFAULT }; - ClustrixNode(const ClustrixMembership& membership, + ClustrixNode(Persister* pPersister, + const ClustrixMembership& membership, const std::string& ip, int mysql_port, int health_port, int health_check_threshold, SERVER* pServer) - : m_id(membership.id()) + : m_persister(*pPersister) + , m_id(membership.id()) , m_status(membership.status()) , m_substate(membership.substate()) , m_instance(membership.instance()) @@ -52,6 +61,8 @@ public: , m_pServer(pServer) , m_pCon(nullptr) { + m_pServer->set_status(SERVER_MASTER | SERVER_RUNNING); + m_persister.persist(*this); } ~ClustrixNode() @@ -87,33 +98,16 @@ public: return m_ip; } - void set_ip(const std::string& ip) - { - m_ip = ip; - m_pServer->server_update_address(ip); - } - int mysql_port() const { return m_mysql_port; } - void set_mysql_port(int port) - { - m_mysql_port = port; - m_pServer->update_port(port); - } - int health_port() const { return m_health_port; } - void set_health_port(int port) - { - m_health_port = port; - } - bool is_running() const { return m_nRunning > 0; @@ -123,9 +117,13 @@ public: { if (running) { - m_nRunning = m_health_check_threshold; + if (m_nRunning == 0) + { + m_pServer->set_status(SERVER_MASTER | SERVER_RUNNING); + m_persister.persist(*this); + } - m_pServer->set_status(SERVER_MASTER | SERVER_RUNNING); + m_nRunning = m_health_check_threshold; } else { @@ -143,11 +141,43 @@ public: if (m_nRunning == 0) { m_pServer->clear_status(SERVER_MASTER 
| SERVER_RUNNING); + m_persister.unpersist(*this); } } } } + void update(const std::string& ip, + int mysql_port, + int health_port) + { + bool changed = false; + + if (ip != m_ip) + { + m_ip = ip; + changed = true; + } + + if (mysql_port != m_mysql_port) + { + m_mysql_port = mysql_port; + m_pServer->update_port(m_mysql_port); + changed = true; + } + + if (health_port != m_health_port) + { + m_health_port = health_port; + changed = true; + } + + if (changed) + { + m_persister.persist(*this); + } + } + void update(Clustrix::Status status, Clustrix::SubState substate, int instance) { m_status = status; @@ -158,6 +188,7 @@ public: void deactivate_server() { m_pServer->is_active = false; + m_persister.unpersist(*this); } bool can_be_used_as_hub(const char* zName, @@ -194,6 +225,7 @@ public: } private: + Persister& m_persister; int m_id; Clustrix::Status m_status; Clustrix::SubState m_substate;