MXS-2424 Use persisted nodes if bootstrap node missing

At runtime the Clustrix monitor will save information about
detected nodes to an sqlite3 database and delete that
information if a node disappears.

At startup, if the monitor fails to connect to a bootstrap
node, it will try to connect to any of the persisted nodes and
start from there.

This means that in general it is sufficient if the Clustrix
monitor at the very first startup can connect to a bootstrap
node; thereafter it will get by even if the bootstrap node
disappears for good.
This commit is contained in:
Johan Wikman
2019-04-08 15:35:24 +03:00
parent 164ca7b5f1
commit 893059c537
3 changed files with 194 additions and 79 deletions

View File

@ -16,6 +16,7 @@
#include <set> #include <set>
#include <maxscale/json_api.hh> #include <maxscale/json_api.hh>
#include <maxscale/paths.h> #include <maxscale/paths.h>
#include <maxscale/secrets.h>
#include "../../../core/internal/config_runtime.hh" #include "../../../core/internal/config_runtime.hh"
#include "../../../core/internal/service.hh" #include "../../../core/internal/service.hh"
@ -51,6 +52,28 @@ static const char SQL_UPSERT_FORMAT[] =
static const char SQL_DELETE_FORMAT[] = static const char SQL_DELETE_FORMAT[] =
"DELETE FROM clustrix_nodes WHERE id = %d"; "DELETE FROM clustrix_nodes WHERE id = %d";
static const char SQL_SELECT[] =
"SELECT ip, mysql_port FROM clustrix_nodes";
using HostPortPair = std::pair<std::string, int>;
using HostPortPairs = std::vector<HostPortPair>;
// sqlite3_exec() row callback used with SQL_SELECT.
//
// @param pData     The collection of host/port pairs given as the 4th argument
//                  of sqlite3_exec(); each accepted row is appended to it.
// @param nColumns  Number of columns in the row; SQL_SELECT selects 2
//                  (ip, mysql_port).
// @param ppColumn  The column values as strings; an entry is a NULL pointer
//                  if the corresponding column value is NULL.
// @param ppNames   The column names; not used.
//
// @return Always 0, so that sqlite3_exec() continues with the next row.
int select_cb(void* pData, int nColumns, char** ppColumn, char** ppNames)
{
    std::vector<HostPortPair>* pNodes = static_cast<std::vector<HostPortPair>*>(pData);

    mxb_assert(nColumns == 2);

    const char* zHost = ppColumn[0];
    const char* zPort = ppColumn[1];

    // sqlite3 hands over a NULL pointer for a NULL column value. Constructing
    // a std::string from it, or passing it to atoi(), is undefined behaviour,
    // so a row lacking either value is ignored instead.
    if (zHost && zPort)
    {
        pNodes->emplace_back(zHost, atoi(zPort));
    }

    return 0;
}
} }
namespace namespace
@ -88,7 +111,7 @@ sqlite3* open_or_create_db(const std::string& path)
if (unlink(path.c_str()) != 0) if (unlink(path.c_str()) != 0)
{ {
MXS_ERROR("Failed to delete database %s that could not be properly " MXS_ERROR("Failed to delete database %s that could not be properly "
"initialized. You should delete the database manually.", path.c_str()); "initialized. It should be deleted manually.", path.c_str());
sqlite3_close_v2(pDb); sqlite3_close_v2(pDb);
pDb = nullptr; pDb = nullptr;
} }
@ -282,10 +305,14 @@ void ClustrixMonitor::choose_hub(Clustrix::Softfailed softfailed)
// then we check the bootstrap servers, and // then we check the bootstrap servers, and
if (!choose_bootstrap_hub(softfailed, ips)) if (!choose_bootstrap_hub(softfailed, ips))
{ {
// finally, if all else fails, we check servers that have been persisted. // finally, if all else fails - in practise we will only get here at
// In practise we will only get here at startup (no dynamic servers) // startup (no dynamic servers) if the bootstrap servers cannot be
// if the bootstrap servers cannot be contacted. // contacted - we try to refresh the nodes using persisted information
choose_persisted_hub(softfailed, ips); if (refresh_using_persisted_nodes(ips))
{
// and then select a hub from the dynamic ones.
choose_dynamic_hub(softfailed, ips);
}
} }
} }
@ -345,32 +372,108 @@ bool ClustrixMonitor::choose_bootstrap_hub(Clustrix::Softfailed softfailed, std:
return m_pHub_con != nullptr; return m_pHub_con != nullptr;
} }
bool ClustrixMonitor::choose_persisted_hub(Clustrix::Softfailed softfailed, std::set<string>& ips_checked) bool ClustrixMonitor::refresh_using_persisted_nodes(std::set<string>& ips_checked)
{ {
// TODO: Check persisted servers. MXS_NOTICE("Attempting to find a Clustrix bootstrap node from one of the nodes "
return false; "used during the previous run of MaxScale.");
bool refreshed = false;
HostPortPairs nodes;
char* pError = nullptr;
int rv = sqlite3_exec(m_pDb, SQL_SELECT, select_cb, &nodes, &pError);
if (rv == SQLITE_OK)
{
const std::string& username = m_settings.conn_settings.username;
const std::string& password = m_settings.conn_settings.password;
char* zPassword = decrypt_password(password.c_str());
auto it = nodes.begin();
while (!refreshed && (it != nodes.end()))
{
const auto& node = *it;
const std::string& host = node.first;
if (ips_checked.find(host) == ips_checked.end())
{
ips_checked.insert(host);
int port = node.second;
MXS_NOTICE("Trying to find out cluster nodes from %s:%d.", host.c_str(), port);
MYSQL* pHub_con = mysql_init(NULL);
if (mysql_real_connect(pHub_con, host.c_str(),
username.c_str(), zPassword,
nullptr,
port, nullptr, 0))
{
if (Clustrix::is_part_of_the_quorum(name(), pHub_con))
{
if (refresh_nodes(pHub_con))
{
MXS_NOTICE("Cluster nodes refreshed.");
refreshed = true;
}
}
else
{
MXS_WARNING("%s:%d is not part of the quorum, ignoring.", host.c_str(), port);
}
}
else
{
MXS_WARNING("Could not connect to %s:%d.", host.c_str(), port);
}
mysql_close(pHub_con);
}
++it;
}
MXS_FREE(zPassword);
}
else
{
MXS_ERROR("Could not look up persisted nodes: %s", pError ? pError : "Unknown error");
}
return refreshed;
} }
void ClustrixMonitor::refresh_nodes() bool ClustrixMonitor::refresh_nodes()
{ {
mxb_assert(m_pHub_con); mxb_assert(m_pHub_con);
return refresh_nodes(m_pHub_con);
}
bool ClustrixMonitor::refresh_nodes(MYSQL* pHub_con)
{
mxb_assert(pHub_con);
map<int, ClustrixMembership> memberships; map<int, ClustrixMembership> memberships;
if (check_cluster_membership(&memberships)) bool refreshed = check_cluster_membership(pHub_con, &memberships);
if (refreshed)
{ {
const char ZQUERY[] = const char ZQUERY[] =
"SELECT ni.nodeid, ni.iface_ip, ni.mysql_port, ni.healthmon_port, sn.nodeid " "SELECT ni.nodeid, ni.iface_ip, ni.mysql_port, ni.healthmon_port, sn.nodeid "
"FROM system.nodeinfo AS ni " "FROM system.nodeinfo AS ni "
"LEFT JOIN system.softfailed_nodes AS sn ON ni.nodeid = sn.nodeid"; "LEFT JOIN system.softfailed_nodes AS sn ON ni.nodeid = sn.nodeid";
if (mysql_query(m_pHub_con, ZQUERY) == 0) if (mysql_query(pHub_con, ZQUERY) == 0)
{ {
MYSQL_RES* pResult = mysql_store_result(m_pHub_con); MYSQL_RES* pResult = mysql_store_result(pHub_con);
if (pResult) if (pResult)
{ {
mxb_assert(mysql_field_count(m_pHub_con) == 5); mxb_assert(mysql_field_count(pHub_con) == 5);
set<int> nids; set<int> nids;
for (const auto& element : m_nodes) for (const auto& element : m_nodes)
@ -404,25 +507,7 @@ void ClustrixMonitor::refresh_nodes()
ClustrixNode& node = nit->second; ClustrixNode& node = nit->second;
bool changed = false; node.update(ip, mysql_port, health_port);
if (node.ip() != ip)
{
node.set_ip(ip);
changed = true;
}
if (node.mysql_port() != mysql_port)
{
node.set_mysql_port(mysql_port);
changed = true;
}
if (node.health_port() != health_port)
{
node.set_health_port(health_port);
changed = true;
}
bool is_draining = node.server()->is_draining(); bool is_draining = node.server()->is_draining();
@ -443,11 +528,6 @@ void ClustrixMonitor::refresh_nodes()
node.server()->clear_status(SERVER_DRAINING); node.server()->clear_status(SERVER_DRAINING);
} }
if (changed)
{
persist_node(node);
}
nids.erase(id); nids.erase(id);
} }
else if (mit != memberships.end()) else if (mit != memberships.end())
@ -473,11 +553,10 @@ void ClustrixMonitor::refresh_nodes()
const ClustrixMembership& membership = mit->second; const ClustrixMembership& membership = mit->second;
int health_check_threshold = m_config.health_check_threshold(); int health_check_threshold = m_config.health_check_threshold();
ClustrixNode node(membership, ip, mysql_port, health_port, ClustrixNode node(this, membership, ip, mysql_port, health_port,
health_check_threshold, pServer); health_check_threshold, pServer);
m_nodes.insert(make_pair(id, node)); m_nodes.insert(make_pair(id, node));
persist_node(node);
// New server, so it needs to be added to all services that // New server, so it needs to be added to all services that
// use this monitor for defining its cluster of servers. // use this monitor for defining its cluster of servers.
@ -517,7 +596,6 @@ void ClustrixMonitor::refresh_nodes()
ClustrixNode& node = it->second; ClustrixNode& node = it->second;
node.set_running(false, ClustrixNode::APPROACH_OVERRIDE); node.set_running(false, ClustrixNode::APPROACH_OVERRIDE);
unpersist_node(node);
} }
vector<string> health_urls; vector<string> health_urls;
@ -536,15 +614,17 @@ void ClustrixMonitor::refresh_nodes()
else else
{ {
MXS_WARNING("%s: No result returned for '%s' on %s.", MXS_WARNING("%s: No result returned for '%s' on %s.",
name(), ZQUERY, m_pHub_server->address); name(), ZQUERY, mysql_get_host_info(pHub_con));
} }
} }
else else
{ {
MXS_ERROR("%s: Could not execute '%s' on %s: %s", MXS_ERROR("%s: Could not execute '%s' on %s: %s",
name(), ZQUERY, m_pHub_server->address, mysql_error(m_pHub_con)); name(), ZQUERY, mysql_get_host_info(pHub_con), mysql_error(pHub_con));
} }
} }
return refreshed;
} }
void ClustrixMonitor::check_cluster(Clustrix::Softfailed softfailed) void ClustrixMonitor::check_cluster(Clustrix::Softfailed softfailed)
@ -578,24 +658,23 @@ void ClustrixMonitor::check_hub(Clustrix::Softfailed softfailed)
} }
} }
bool ClustrixMonitor::check_cluster_membership(std::map<int, ClustrixMembership>* pMemberships) bool ClustrixMonitor::check_cluster_membership(MYSQL* pHub_con,
std::map<int, ClustrixMembership>* pMemberships)
{ {
mxb_assert(pHub_con);
mxb_assert(pMemberships); mxb_assert(pMemberships);
mxb_assert(m_pHub_con);
mxb_assert(m_pHub_server);
bool rv = false; bool rv = false;
const char ZQUERY[] = "SELECT nid, status, instance, substate FROM system.membership"; const char ZQUERY[] = "SELECT nid, status, instance, substate FROM system.membership";
if (mysql_query(m_pHub_con, ZQUERY) == 0) if (mysql_query(pHub_con, ZQUERY) == 0)
{ {
MYSQL_RES* pResult = mysql_store_result(m_pHub_con); MYSQL_RES* pResult = mysql_store_result(pHub_con);
if (pResult) if (pResult)
{ {
mxb_assert(mysql_field_count(m_pHub_con) == 4); mxb_assert(mysql_field_count(pHub_con) == 4);
set<int> nids; set<int> nids;
for (const auto& element : m_nodes) for (const auto& element : m_nodes)
@ -666,7 +745,7 @@ bool ClustrixMonitor::check_cluster_membership(std::map<int, ClustrixMembership>
else else
{ {
MXS_ERROR("%s: Could not execute '%s' on %s: %s", MXS_ERROR("%s: Could not execute '%s' on %s: %s",
name(), ZQUERY, m_pHub_server->address, mysql_error(m_pHub_con)); name(), ZQUERY, mysql_get_host_info(pHub_con), mysql_error(pHub_con));
} }
return rv; return rv;
@ -894,7 +973,7 @@ bool ClustrixMonitor::perform_operation(Operation operation,
return performed; return performed;
} }
void ClustrixMonitor::persist_node(const ClustrixNode& node) void ClustrixMonitor::persist(const ClustrixNode& node)
{ {
if (!m_pDb) if (!m_pDb)
{ {
@ -923,7 +1002,7 @@ void ClustrixMonitor::persist_node(const ClustrixNode& node)
} }
} }
void ClustrixMonitor::unpersist_node(const ClustrixNode& node) void ClustrixMonitor::unpersist(const ClustrixNode& node)
{ {
if (!m_pDb) if (!m_pDb)
{ {

View File

@ -21,7 +21,8 @@
#include "clustrixmembership.hh" #include "clustrixmembership.hh"
#include "clustrixnode.hh" #include "clustrixnode.hh"
class ClustrixMonitor : public maxscale::MonitorWorker class ClustrixMonitor : public maxscale::MonitorWorker,
private ClustrixNode::Persister
{ {
ClustrixMonitor(const ClustrixMonitor&) = delete; ClustrixMonitor(const ClustrixMonitor&) = delete;
ClustrixMonitor& operator=(const ClustrixMonitor&) = delete; ClustrixMonitor& operator=(const ClustrixMonitor&) = delete;
@ -91,10 +92,12 @@ private:
bool choose_dynamic_hub(Clustrix::Softfailed softfailed, std::set<std::string>& ips_checked); bool choose_dynamic_hub(Clustrix::Softfailed softfailed, std::set<std::string>& ips_checked);
bool choose_bootstrap_hub(Clustrix::Softfailed softfailed, std::set<std::string>& ips_checked); bool choose_bootstrap_hub(Clustrix::Softfailed softfailed, std::set<std::string>& ips_checked);
bool choose_persisted_hub(Clustrix::Softfailed softfailed, std::set<std::string>& ips_checked); bool refresh_using_persisted_nodes(std::set<std::string>& ips_checked);
void refresh_nodes(); bool refresh_nodes();
bool check_cluster_membership(std::map<int, ClustrixMembership>* pMemberships); bool refresh_nodes(MYSQL* pHub_con);
bool check_cluster_membership(MYSQL* pHub_con,
std::map<int, ClustrixMembership>* pMemberships);
void update_server_statuses(); void update_server_statuses();
@ -136,8 +139,9 @@ private:
return mxb::WorkerLoad::get_time_ms(); return mxb::WorkerLoad::get_time_ms();
} }
void persist_node(const ClustrixNode& node); // ClustrixNode::Persister
void unpersist_node(const ClustrixNode& node); void persist(const ClustrixNode& node);
void unpersist(const ClustrixNode& node);
private: private:
Config m_config; Config m_config;

View File

@ -22,6 +22,13 @@
class ClustrixNode class ClustrixNode
{ {
public: public:
/**
 * Interface through which a ClustrixNode requests that information
 * about itself is stored or removed.
 */
class Persister
{
public:
    // The interface is polymorphic, so the destructor is virtual to make
    // destruction via a Persister pointer well-defined.
    virtual ~Persister() = default;

    /**
     * Persist information about a node.
     *
     * @param node  The node whose information should be persisted.
     */
    virtual void persist(const ClustrixNode& node) = 0;

    /**
     * Remove persisted information about a node.
     *
     * @param node  The node whose information should be removed.
     */
    virtual void unpersist(const ClustrixNode& node) = 0;
};
enum enum
{ {
DEFAULT_MYSQL_PORT = 3306, DEFAULT_MYSQL_PORT = 3306,
@ -34,13 +41,15 @@ public:
APPROACH_DEFAULT APPROACH_DEFAULT
}; };
ClustrixNode(const ClustrixMembership& membership, ClustrixNode(Persister* pPersister,
const ClustrixMembership& membership,
const std::string& ip, const std::string& ip,
int mysql_port, int mysql_port,
int health_port, int health_port,
int health_check_threshold, int health_check_threshold,
SERVER* pServer) SERVER* pServer)
: m_id(membership.id()) : m_persister(*pPersister)
, m_id(membership.id())
, m_status(membership.status()) , m_status(membership.status())
, m_substate(membership.substate()) , m_substate(membership.substate())
, m_instance(membership.instance()) , m_instance(membership.instance())
@ -52,6 +61,8 @@ public:
, m_pServer(pServer) , m_pServer(pServer)
, m_pCon(nullptr) , m_pCon(nullptr)
{ {
m_pServer->set_status(SERVER_MASTER | SERVER_RUNNING);
m_persister.persist(*this);
} }
~ClustrixNode() ~ClustrixNode()
@ -87,33 +98,16 @@ public:
return m_ip; return m_ip;
} }
void set_ip(const std::string& ip)
{
m_ip = ip;
m_pServer->server_update_address(ip);
}
int mysql_port() const int mysql_port() const
{ {
return m_mysql_port; return m_mysql_port;
} }
void set_mysql_port(int port)
{
m_mysql_port = port;
m_pServer->update_port(port);
}
int health_port() const int health_port() const
{ {
return m_health_port; return m_health_port;
} }
void set_health_port(int port)
{
m_health_port = port;
}
bool is_running() const bool is_running() const
{ {
return m_nRunning > 0; return m_nRunning > 0;
@ -123,9 +117,13 @@ public:
{ {
if (running) if (running)
{ {
m_nRunning = m_health_check_threshold; if (m_nRunning == 0)
{
m_pServer->set_status(SERVER_MASTER | SERVER_RUNNING);
m_persister.persist(*this);
}
m_pServer->set_status(SERVER_MASTER | SERVER_RUNNING); m_nRunning = m_health_check_threshold;
} }
else else
{ {
@ -143,11 +141,43 @@ public:
if (m_nRunning == 0) if (m_nRunning == 0)
{ {
m_pServer->clear_status(SERVER_MASTER | SERVER_RUNNING); m_pServer->clear_status(SERVER_MASTER | SERVER_RUNNING);
m_persister.unpersist(*this);
} }
} }
} }
} }
/**
 * Update the network information of the node.
 *
 * Each value is compared with the stored one and only changed values are
 * applied. If anything changed, the node is persisted anew via the persister.
 *
 * @param ip           The current IP address of the node.
 * @param mysql_port   The current MySQL port of the node.
 * @param health_port  The current health check port of the node.
 */
void update(const std::string& ip,
            int mysql_port,
            int health_port)
{
    bool changed = false;

    if (ip != m_ip)
    {
        m_ip = ip;
        // The address of the SERVER must follow the node, just as its port
        // does below; otherwise the server keeps referring to the old address.
        m_pServer->server_update_address(m_ip);
        changed = true;
    }

    if (mysql_port != m_mysql_port)
    {
        m_mysql_port = mysql_port;
        m_pServer->update_port(m_mysql_port);
        changed = true;
    }

    if (health_port != m_health_port)
    {
        // The health port is only stored in the node; nothing is updated
        // on the server for it.
        m_health_port = health_port;
        changed = true;
    }

    if (changed)
    {
        m_persister.persist(*this);
    }
}
void update(Clustrix::Status status, Clustrix::SubState substate, int instance) void update(Clustrix::Status status, Clustrix::SubState substate, int instance)
{ {
m_status = status; m_status = status;
@ -158,6 +188,7 @@ public:
void deactivate_server() void deactivate_server()
{ {
m_pServer->is_active = false; m_pServer->is_active = false;
m_persister.unpersist(*this);
} }
bool can_be_used_as_hub(const char* zName, bool can_be_used_as_hub(const char* zName,
@ -194,6 +225,7 @@ public:
} }
private: private:
Persister& m_persister;
int m_id; int m_id;
Clustrix::Status m_status; Clustrix::Status m_status;
Clustrix::SubState m_substate; Clustrix::SubState m_substate;