MXS-2330 Do not use softfailed node as hub
When a softfailed node is finally revoked, it will appear to be the sole node of a functioning Clustrix cluster. To ensure that the Clustrix monitor does not stick to that node, a node used as hub that is found to be softfailed is immediately replaced with another node.
This commit is contained in:
parent
35e17d9878
commit
5c34550b40
@ -150,7 +150,54 @@ bool Clustrix::is_part_of_the_quorum(const char* zName, const SERVER& server, MY
|
||||
}
|
||||
else
|
||||
{
|
||||
MXS_ERROR("%s: Could not execute '%s' on %s:%d: %s", zName, zQuery, zAddress, port, mysql_error(pCon));
|
||||
MXS_ERROR("%s: Could not execute '%s' on %s:%d: %s",
|
||||
zName, zQuery, zAddress, port, mysql_error(pCon));
|
||||
}
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
bool Clustrix::is_being_softfailed(const char* zName, const SERVER& server, MYSQL* pCon)
|
||||
{
|
||||
bool rv = false;
|
||||
|
||||
const char* zAddress = server.address;
|
||||
int port = server.port;
|
||||
|
||||
const char ZQUERY_TEMPLATE[] =
|
||||
"SELECT sn.nodeid FROM system.softfailed_nodes AS sn INNER JOIN system.nodeinfo AS ni "
|
||||
"WHERE sn.nodeid = ni.nodeid AND ni.iface_ip = '%s'";
|
||||
|
||||
char zQuery[sizeof(ZQUERY_TEMPLATE) + strlen(zAddress)];
|
||||
|
||||
sprintf(zQuery, ZQUERY_TEMPLATE, zAddress);
|
||||
|
||||
if (mysql_query(pCon, zQuery) == 0)
|
||||
{
|
||||
MYSQL_RES* pResult = mysql_store_result(pCon);
|
||||
|
||||
if (pResult)
|
||||
{
|
||||
mxb_assert(mysql_field_count(pCon) == 1);
|
||||
|
||||
MYSQL_ROW row;
|
||||
while ((row = mysql_fetch_row(pResult)) != nullptr)
|
||||
{
|
||||
// If a row is found, it is because the node is being softfailed.
|
||||
rv = true;
|
||||
}
|
||||
|
||||
mysql_free_result(pResult);
|
||||
}
|
||||
else
|
||||
{
|
||||
MXS_WARNING("%s: No result returned for '%s' on %s:%d.", zName, zQuery, zAddress, port);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
MXS_ERROR("%s: Could not execute '%s' on %s:%d: %s",
|
||||
zName, zQuery, zAddress, port, mysql_error(pCon));
|
||||
}
|
||||
|
||||
return rv;
|
||||
@ -158,6 +205,7 @@ bool Clustrix::is_part_of_the_quorum(const char* zName, const SERVER& server, MY
|
||||
|
||||
bool Clustrix::ping_or_connect_to_hub(const char* zName,
|
||||
const MXS_MONITORED_SERVER::ConnectionSettings& settings,
|
||||
Softfailed softfailed,
|
||||
SERVER& server,
|
||||
MYSQL** ppCon)
|
||||
{
|
||||
@ -168,7 +216,16 @@ bool Clustrix::ping_or_connect_to_hub(const char* zName,
|
||||
{
|
||||
if (Clustrix::is_part_of_the_quorum(zName, server, *ppCon))
|
||||
{
|
||||
connected = true;
|
||||
if ((softfailed == Softfailed::REJECT) && Clustrix::is_being_softfailed(zName, server, *ppCon))
|
||||
{
|
||||
MXS_NOTICE("%s: The Clustrix node %s used as hub is part of the quorum, "
|
||||
"but it is being softfailed. Switching to another node.",
|
||||
zName, server.address);
|
||||
}
|
||||
else
|
||||
{
|
||||
connected = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -39,6 +39,12 @@ enum class SubState
|
||||
SubState substate_from_string(const std::string& substate);
|
||||
std::string to_string(SubState sub_state);
|
||||
|
||||
/**
 * Tells the hub checks whether a node that is being softfailed is
 * considered ok to use as hub (ACCEPT) or not (REJECT).
 */
enum class Softfailed
|
||||
{
|
||||
ACCEPT,
|
||||
REJECT
|
||||
};
|
||||
|
||||
/**
|
||||
* Is a particular Clustrix node part of the quorum.
|
||||
*
|
||||
@ -66,15 +72,27 @@ inline bool is_part_of_the_quorum(const char* zName, MXS_MONITORED_SERVER& ms)
|
||||
return is_part_of_the_quorum(zName, *ms.server, ms.con);
|
||||
}
|
||||
|
||||
/**
|
||||
* Is a particular Clustrix node being softfailed.
|
||||
*
|
||||
* @param zName The name of the Clustrix monitor instance.
|
||||
* @param server The server object of a Clustrix node.
|
||||
* @param pCon Valid MYSQL handle to the server.
|
||||
*
|
||||
* @return True, if the node is being softfailed, false otherwise.
|
||||
*/
|
||||
bool is_being_softfailed(const char* zName, const SERVER& server, MYSQL* pCon);
|
||||
|
||||
/**
|
||||
* Ping or create connection to server and check whether it can be used
|
||||
* as hub.
|
||||
*
|
||||
* @param zName The name of the Clustrix monitor instance.
|
||||
* @param settings Connection settings
|
||||
* @param server Server object referring to a Clustrix node.
|
||||
* @param ppCon Address of pointer to MYSQL object referring to @server
|
||||
* (@c *ppCon may also be NULL).
|
||||
* @param zName The name of the Clustrix monitor instance.
|
||||
* @param settings Connection settings
|
||||
* @param softfailed Whether a softfailed node is considered ok or not.
|
||||
* @param server Server object referring to a Clustrix node.
|
||||
* @param ppCon Address of pointer to MYSQL object referring to @server
|
||||
* (@c *ppCon may also be NULL).
|
||||
*
|
||||
* @return True, if the server can be used as hub, false otherwise.
|
||||
*
|
||||
@ -82,6 +100,7 @@ inline bool is_part_of_the_quorum(const char* zName, MXS_MONITORED_SERVER& ms)
|
||||
*/
|
||||
bool ping_or_connect_to_hub(const char* zName,
|
||||
const MXS_MONITORED_SERVER::ConnectionSettings& settings,
|
||||
Softfailed softfailed,
|
||||
SERVER& server,
|
||||
MYSQL** ppCon);
|
||||
|
||||
@ -89,17 +108,19 @@ bool ping_or_connect_to_hub(const char* zName,
|
||||
* Ping or create connection to server and check whether it can be used
|
||||
* as hub.
|
||||
*
|
||||
* @param zName The name of the Clustrix monitor instance.
|
||||
* @param settings Connection settings
|
||||
* @param ms The monitored server.
|
||||
* @param zName The name of the Clustrix monitor instance.
|
||||
* @param settings Connection settings
|
||||
* @param softfailed Whether a softfailed node is considered ok or not.
|
||||
* @param ms The monitored server.
|
||||
*
|
||||
* @return True, if the server can be used as hub, false otherwise.
|
||||
*/
|
||||
inline bool ping_or_connect_to_hub(const char* zName,
|
||||
const MXS_MONITORED_SERVER::ConnectionSettings& settings,
|
||||
Softfailed softfailed,
|
||||
MXS_MONITORED_SERVER& ms)
|
||||
{
|
||||
return ping_or_connect_to_hub(zName, settings, *ms.server, &ms.con);
|
||||
return ping_or_connect_to_hub(zName, settings, softfailed, *ms.server, &ms.con);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -61,7 +61,9 @@ bool ClustrixMonitor::configure(const MXS_CONFIG_PARAMETER* pParams)
|
||||
m_config.set_cluster_monitor_interval(pParams->get_integer(CLUSTER_MONITOR_INTERVAL_NAME));
|
||||
m_config.set_health_check_threshold(pParams->get_integer(HEALTH_CHECK_THRESHOLD_NAME));
|
||||
|
||||
check_hub_and_refresh_nodes();
|
||||
// At startup we accept softfailed nodes in an attempt to be able to
|
||||
// connect at any cost. It'll be replaced once there is an alternative.
|
||||
check_cluster(Clustrix::Softfailed::ACCEPT);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -149,9 +151,9 @@ void ClustrixMonitor::post_loop()
|
||||
|
||||
void ClustrixMonitor::tick()
|
||||
{
|
||||
if (now() - m_last_cluster_check > m_config.cluster_monitor_interval())
|
||||
if (should_check_cluster())
|
||||
{
|
||||
check_hub_and_refresh_nodes();
|
||||
check_cluster(Clustrix::Softfailed::REJECT);
|
||||
}
|
||||
|
||||
switch (m_http.status())
|
||||
@ -176,7 +178,7 @@ void ClustrixMonitor::tick()
|
||||
}
|
||||
}
|
||||
|
||||
void ClustrixMonitor::choose_hub()
|
||||
void ClustrixMonitor::choose_hub(Clustrix::Softfailed softfailed)
|
||||
{
|
||||
mxb_assert(!m_pHub_con);
|
||||
|
||||
@ -211,7 +213,7 @@ void ClustrixMonitor::choose_hub()
|
||||
|
||||
if (ips.find(ms.server->address) == ips.end())
|
||||
{
|
||||
if (Clustrix::ping_or_connect_to_hub(m_name, m_settings.conn_settings, ms))
|
||||
if (Clustrix::ping_or_connect_to_hub(m_name, m_settings.conn_settings, softfailed, ms))
|
||||
{
|
||||
pHub_con = ms.con;
|
||||
pHub_server = ms.server;
|
||||
@ -253,7 +255,8 @@ void ClustrixMonitor::refresh_nodes()
|
||||
if (check_cluster_membership(&memberships))
|
||||
{
|
||||
const char ZQUERY[] =
|
||||
"SELECT ni.nodeid, ni.iface_ip, ni.mysql_port, ni.healthmon_port, sn.nodeid FROM system.nodeinfo AS ni "
|
||||
"SELECT ni.nodeid, ni.iface_ip, ni.mysql_port, ni.healthmon_port, sn.nodeid "
|
||||
"FROM system.nodeinfo AS ni "
|
||||
"LEFT JOIN system.softfailed_nodes AS sn ON ni.nodeid = sn.nodeid";
|
||||
|
||||
if (mysql_query(m_pHub_con, ZQUERY) == 0)
|
||||
@ -315,14 +318,16 @@ void ClustrixMonitor::refresh_nodes()
|
||||
|
||||
if (softfailed && !is_being_drained)
|
||||
{
|
||||
MXS_NOTICE("%s: Node %d (%s) has been SOFTFAILed. Turning ON 'Being Drained'.",
|
||||
MXS_NOTICE("%s: Node %d (%s) has been SOFTFAILed. "
|
||||
"Turning ON 'Being Drained'.",
|
||||
m_name, node.id(), node.server()->address);
|
||||
|
||||
node.server()->set_status(SERVER_BEING_DRAINED);
|
||||
}
|
||||
else if (!softfailed && is_being_drained)
|
||||
{
|
||||
MXS_NOTICE("%s: Node %d (%s) is no longer being SOFTFAILed. Turning OFF 'Being Drained'.",
|
||||
MXS_NOTICE("%s: Node %d (%s) is no longer being SOFTFAILed. "
|
||||
"Turning OFF 'Being Drained'.",
|
||||
m_name, node.id(), node.server()->address);
|
||||
|
||||
node.server()->clear_status(SERVER_BEING_DRAINED);
|
||||
@ -409,7 +414,7 @@ void ClustrixMonitor::refresh_nodes()
|
||||
|
||||
m_health_urls.swap(health_urls);
|
||||
|
||||
m_last_cluster_check = now();
|
||||
cluster_checked();
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -425,16 +430,16 @@ void ClustrixMonitor::refresh_nodes()
|
||||
}
|
||||
}
|
||||
|
||||
void ClustrixMonitor::check_hub_and_refresh_nodes()
|
||||
void ClustrixMonitor::check_cluster(Clustrix::Softfailed softfailed)
|
||||
{
|
||||
if (m_pHub_con)
|
||||
{
|
||||
check_hub();
|
||||
check_hub(softfailed);
|
||||
}
|
||||
|
||||
if (!m_pHub_con)
|
||||
{
|
||||
choose_hub();
|
||||
choose_hub(softfailed);
|
||||
}
|
||||
|
||||
if (m_pHub_con)
|
||||
@ -443,12 +448,13 @@ void ClustrixMonitor::check_hub_and_refresh_nodes()
|
||||
}
|
||||
}
|
||||
|
||||
void ClustrixMonitor::check_hub()
|
||||
void ClustrixMonitor::check_hub(Clustrix::Softfailed softfailed)
|
||||
{
|
||||
mxb_assert(m_pHub_con);
|
||||
mxb_assert(m_pHub_server);
|
||||
|
||||
if (!Clustrix::ping_or_connect_to_hub(m_name, m_settings.conn_settings, *m_pHub_server, &m_pHub_con))
|
||||
if (!Clustrix::ping_or_connect_to_hub(m_name, m_settings.conn_settings, softfailed,
|
||||
*m_pHub_server, &m_pHub_con))
|
||||
{
|
||||
mysql_close(m_pHub_con);
|
||||
m_pHub_con = nullptr;
|
||||
@ -658,7 +664,7 @@ bool ClustrixMonitor::check_http(Call::action_t action)
|
||||
if (!node.is_running())
|
||||
{
|
||||
// Ok, the node is down. Trigger a cluster check at next tick.
|
||||
m_last_cluster_check = 0;
|
||||
trigger_cluster_check();
|
||||
}
|
||||
}
|
||||
|
||||
@ -677,7 +683,13 @@ bool ClustrixMonitor::check_http(Call::action_t action)
|
||||
|
||||
bool ClustrixMonitor::perform_softfail(SERVER* pServer, json_t** ppError)
|
||||
{
|
||||
return perform_operation(Operation::SOFTFAIL, pServer, ppError);
|
||||
bool rv = perform_operation(Operation::SOFTFAIL, pServer, ppError);
|
||||
|
||||
// Irrespective of whether the operation succeeded or not
|
||||
// a cluster check is triggered at next tick.
|
||||
trigger_cluster_check();
|
||||
|
||||
return rv;
|
||||
}
|
||||
|
||||
bool ClustrixMonitor::perform_unsoftfail(SERVER* pServer, json_t** ppError)
|
||||
@ -698,7 +710,7 @@ bool ClustrixMonitor::perform_operation(Operation operation,
|
||||
|
||||
if (!m_pHub_con)
|
||||
{
|
||||
check_hub_and_refresh_nodes();
|
||||
check_cluster(Clustrix::Softfailed::ACCEPT);
|
||||
}
|
||||
|
||||
if (m_pHub_con)
|
||||
|
@ -81,9 +81,9 @@ private:
|
||||
|
||||
void tick() override;
|
||||
|
||||
void check_hub_and_refresh_nodes();
|
||||
void check_hub();
|
||||
void choose_hub();
|
||||
void check_cluster(Clustrix::Softfailed softfailed);
|
||||
void check_hub(Clustrix::Softfailed softfailed);
|
||||
void choose_hub(Clustrix::Softfailed softfailed);
|
||||
void refresh_nodes();
|
||||
bool check_cluster_membership(std::map<int, ClustrixMembership>* pMemberships);
|
||||
|
||||
@ -107,6 +107,21 @@ private:
|
||||
json_t** ppError);
|
||||
|
||||
|
||||
// Returns true when the time elapsed since the last recorded cluster
// check, as measured by now(), exceeds the configured cluster monitor
// interval.
bool should_check_cluster() const
|
||||
{
|
||||
return now() - m_last_cluster_check > m_config.cluster_monitor_interval();
|
||||
}
|
||||
|
||||
// Requests a cluster check by resetting the time of the last check to 0,
// so that the elapsed time seen by should_check_cluster() becomes the
// full value of now() (presumably larger than any configured interval).
void trigger_cluster_check()
|
||||
{
|
||||
m_last_cluster_check = 0;
|
||||
}
|
||||
|
||||
// Records the current time, as returned by now(), as the time of the
// most recent cluster check.
void cluster_checked()
|
||||
{
|
||||
m_last_cluster_check = now();
|
||||
}
|
||||
|
||||
static long now()
|
||||
{
|
||||
return mxb::WorkerLoad::get_time_ms();
|
||||
|
@ -18,7 +18,8 @@ bool ClustrixNode::can_be_used_as_hub(const char* zName,
|
||||
const MXS_MONITORED_SERVER::ConnectionSettings& settings)
|
||||
{
|
||||
mxb_assert(m_pServer);
|
||||
bool rv = Clustrix::ping_or_connect_to_hub(zName, settings, *m_pServer, &m_pCon);
|
||||
bool rv = Clustrix::ping_or_connect_to_hub(zName, settings, Clustrix::Softfailed::REJECT,
|
||||
*m_pServer, &m_pCon);
|
||||
|
||||
if (!rv)
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user