MXS-1845 Only select a master if current master is no longer usable
The purpose is to make the selected master server sticky. The master is reselected only if the current master is no longer a valid master.
This commit is contained in:
@ -14,8 +14,10 @@
|
|||||||
#include "mariadbmon.hh"
|
#include "mariadbmon.hh"
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <sstream>
|
||||||
#include <maxscale/modutil.h>
|
#include <maxscale/modutil.h>
|
||||||
#include <maxscale/mysql_utils.h>
|
#include <maxscale/mysql_utils.h>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
using std::string;
|
using std::string;
|
||||||
|
|
||||||
@ -57,6 +59,11 @@ void topology_DFS(MariaDBServer* node, T* data, void (*visit_func)(MariaDBServer
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool server_config_compare(const MariaDBServer* lhs, const MariaDBServer* rhs)
|
||||||
|
{
|
||||||
|
return lhs->m_config_index < rhs->m_config_index;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This function computes the replication tree from a set of monitored servers and returns the root server
|
* This function computes the replication tree from a set of monitored servers and returns the root server
|
||||||
* with SERVER_MASTER bit. The tree is computed even for servers in 'maintenance' mode.
|
* with SERVER_MASTER bit. The tree is computed even for servers in 'maintenance' mode.
|
||||||
@ -330,6 +337,8 @@ void MariaDBMonitor::tarjan_scc_visit_node(MariaDBServer *node, ServerArray* sta
|
|||||||
cycle_node.cycle = cycle_ind;
|
cycle_node.cycle = cycle_ind;
|
||||||
ServerArray& members = m_cycles[cycle_ind]; // Creates array if didn't exist
|
ServerArray& members = m_cycles[cycle_ind]; // Creates array if didn't exist
|
||||||
members.push_back(cycle_server);
|
members.push_back(cycle_server);
|
||||||
|
// Sort the cycle members according to monitor config order.
|
||||||
|
std::sort(members.begin(), members.end(), server_config_compare);
|
||||||
// All cycle elements popped. Next cycle...
|
// All cycle elements popped. Next cycle...
|
||||||
*next_cycle = cycle_ind + 1;
|
*next_cycle = cycle_ind + 1;
|
||||||
}
|
}
|
||||||
@ -998,24 +1007,31 @@ MariaDBServer* MariaDBMonitor::find_topology_master_server()
|
|||||||
// For each cycle, it's enough to take one sample server, as all members of a cycle have the same reach.
|
// For each cycle, it's enough to take one sample server, as all members of a cycle have the same reach.
|
||||||
for (auto iter = m_cycles.begin(); iter != m_cycles.end(); iter++)
|
for (auto iter = m_cycles.begin(); iter != m_cycles.end(); iter++)
|
||||||
{
|
{
|
||||||
ServerArray& cycle_members = m_cycles[(*iter).first];
|
int cycle_id = iter->first;
|
||||||
MariaDBServer* sample_server = find_master_inside_cycle(cycle_members);
|
ServerArray& cycle_members = m_cycles[cycle_id];
|
||||||
if (sample_server)
|
// Check that no server in the cycle is replicating from outside the cycle. This requirement is
|
||||||
|
// analogous with the same requirement for non-cycle servers.
|
||||||
|
if (!cycle_has_master_server(cycle_members))
|
||||||
{
|
{
|
||||||
master_candidates.push_back(sample_server);
|
MariaDBServer* sample_server = find_master_inside_cycle(cycle_members);
|
||||||
}
|
if (sample_server)
|
||||||
else
|
|
||||||
{
|
|
||||||
// No single server in the cycle was viable.
|
|
||||||
const char WARN_MSG[] = "No valid master server could be found in the cycle with servers '%s'.";
|
|
||||||
string server_names = monitored_servers_to_string(cycle_members);
|
|
||||||
MXS_WARNING(WARN_MSG, server_names.c_str());
|
|
||||||
|
|
||||||
for (auto iter2 = cycle_members.begin(); iter2 != cycle_members.end(); iter2++)
|
|
||||||
{
|
{
|
||||||
MariaDBServer* disqualified_server = *iter2;
|
master_candidates.push_back(sample_server);
|
||||||
string reasons = disqualify_reasons_to_string(disqualified_server);
|
}
|
||||||
MXS_WARNING(SERVER_DISQUALIFIED, disqualified_server->name(), reasons.c_str());
|
else
|
||||||
|
{
|
||||||
|
// No single server in the cycle was viable.
|
||||||
|
const char WARN_MSG[] = "No valid master server could be found in the cycle with "
|
||||||
|
"servers '%s'.";
|
||||||
|
string server_names = monitored_servers_to_string(cycle_members);
|
||||||
|
MXS_WARNING(WARN_MSG, server_names.c_str());
|
||||||
|
|
||||||
|
for (auto iter2 = cycle_members.begin(); iter2 != cycle_members.end(); iter2++)
|
||||||
|
{
|
||||||
|
MariaDBServer* disqualified_server = *iter2;
|
||||||
|
string reasons = disqualify_reasons_to_string(disqualified_server);
|
||||||
|
MXS_WARNING(SERVER_DISQUALIFIED, disqualified_server->name(), reasons.c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1203,3 +1219,98 @@ void MariaDBMonitor::assign_slave_and_relay_master(MariaDBServer* node)
|
|||||||
node->set_status(SERVER_RELAY_MASTER);
|
node->set_status(SERVER_RELAY_MASTER);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Should a new master server be selected?
|
||||||
|
*
|
||||||
|
* @param reason_out Output for a text description
|
||||||
|
* @return True, if the current master has changed in a way that a new master should be selected.
|
||||||
|
*/
|
||||||
|
bool MariaDBMonitor::master_no_longer_valid(std::string* reason_out)
|
||||||
|
{
|
||||||
|
// The master server of the cluster needs to be re-calculated in the following four cases:
|
||||||
|
bool rval = false;
|
||||||
|
// 1) There is no master.
|
||||||
|
if (m_master == NULL)
|
||||||
|
{
|
||||||
|
rval = true;
|
||||||
|
}
|
||||||
|
// 2) read_only has been activated on the master.
|
||||||
|
else if (m_master->is_read_only())
|
||||||
|
{
|
||||||
|
rval = true;
|
||||||
|
*reason_out = "it is in read-only mode";
|
||||||
|
}
|
||||||
|
// 3) The master was a non-replicating master (not in a cycle) but now has a slave connection.
|
||||||
|
else if (m_master_cycle_status.cycle_id == NodeData::CYCLE_NONE)
|
||||||
|
{
|
||||||
|
// The master should not have a master of its own.
|
||||||
|
if (!m_master->m_node.parents.empty())
|
||||||
|
{
|
||||||
|
rval = true;
|
||||||
|
*reason_out = "it has started replicating from another server in the cluster.";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 4) The master was part of a cycle but is no longer, or one of the servers in the cycle is
|
||||||
|
// replicating from a server outside the cycle.
|
||||||
|
else
|
||||||
|
{
|
||||||
|
/* The master was previously in a cycle. Compare the current cycle to the previous data and see
|
||||||
|
* if the cycle is still the best multimaster group. */
|
||||||
|
int current_cycle_id = m_master->m_node.cycle;
|
||||||
|
|
||||||
|
// 4a) The master is no longer in a cycle.
|
||||||
|
if (current_cycle_id == NodeData::CYCLE_NONE)
|
||||||
|
{
|
||||||
|
rval = true;
|
||||||
|
ServerArray& old_members = m_master_cycle_status.cycle_members;
|
||||||
|
string server_names_old = monitored_servers_to_string(old_members);
|
||||||
|
*reason_out = "it is no longer in the multimaster group (" + server_names_old + ").";
|
||||||
|
}
|
||||||
|
// 4b) The master is still in a cycle but the cycle has gained a master outside of the cycle.
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ServerArray& current_members = m_cycles[current_cycle_id];
|
||||||
|
if (cycle_has_master_server(current_members))
|
||||||
|
{
|
||||||
|
rval = true;
|
||||||
|
string server_names_current = monitored_servers_to_string(current_members);
|
||||||
|
*reason_out = "a server in the master's multimaster group (" + server_names_current +
|
||||||
|
") is replicating from a server not in the group.";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rval;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Check if any of the servers in the cycle is replicating from a server not in the cycle. External masters
|
||||||
|
* do not count.
|
||||||
|
*
|
||||||
|
* @param cycle The cycle to check
|
||||||
|
* @return True if a server is replicating from a master not in the same cycle
|
||||||
|
*/
|
||||||
|
bool MariaDBMonitor::cycle_has_master_server(ServerArray& cycle_servers)
|
||||||
|
{
|
||||||
|
bool outside_replication = false;
|
||||||
|
int cycle_id = cycle_servers.front()->m_node.cycle;
|
||||||
|
// Looks good, check that no cycle server is replicating from elsewhere.
|
||||||
|
for (auto iter = cycle_servers.begin(); iter != cycle_servers.end() && !outside_replication; iter++)
|
||||||
|
{
|
||||||
|
MariaDBServer* server = *iter;
|
||||||
|
for (auto iter_master = server->m_node.parents.begin();
|
||||||
|
iter_master != server->m_node.parents.end();
|
||||||
|
iter_master++)
|
||||||
|
{
|
||||||
|
if ((*iter_master)->m_node.cycle != cycle_id)
|
||||||
|
{
|
||||||
|
// Cycle member is replicating from a server that is not in the current cycle. The
|
||||||
|
// cycle is not a valid "master" cycle.
|
||||||
|
outside_replication = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return outside_replication;
|
||||||
|
}
|
||||||
@ -77,7 +77,7 @@ void MariaDBMonitor::reset_server_info()
|
|||||||
// Next, initialize the data.
|
// Next, initialize the data.
|
||||||
for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next)
|
for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next)
|
||||||
{
|
{
|
||||||
m_servers.push_back(new MariaDBServer(mon_server));
|
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size()));
|
||||||
}
|
}
|
||||||
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
|
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
|
||||||
{
|
{
|
||||||
@ -392,20 +392,36 @@ void MariaDBMonitor::tick()
|
|||||||
|
|
||||||
build_replication_graph();
|
build_replication_graph();
|
||||||
find_graph_cycles();
|
find_graph_cycles();
|
||||||
// Use the information to find the so far best master server.
|
string reason;
|
||||||
MariaDBServer* root_master = find_topology_master_server();
|
MariaDBServer* root_master = m_master; // TODO: Refactor this out by reducing use of root_master
|
||||||
if (root_master)
|
if (master_no_longer_valid(&reason))
|
||||||
{
|
{
|
||||||
MXS_DEBUG("Server '%s' is the best master candidate with %d slaves.",
|
if (m_master && !reason.empty())
|
||||||
root_master->name(), root_master->m_node.reach);
|
{
|
||||||
m_master = root_master;
|
MXS_WARNING("The previous master server '%s' is no longer a valid master because %s",
|
||||||
|
m_master->name(), reason.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// The current master is no longer ok (or it never was). Find another. Master changes are logged
|
||||||
|
// by the log_master_changes()-method.
|
||||||
|
root_master = find_topology_master_server();
|
||||||
|
if (root_master)
|
||||||
|
{
|
||||||
|
m_master = root_master;
|
||||||
|
// A new master has been set. Save some data regarding the type of the master.
|
||||||
|
int new_cycle_id = m_master->m_node.cycle;
|
||||||
|
m_master_cycle_status.cycle_id = new_cycle_id;
|
||||||
|
if (new_cycle_id == NodeData::CYCLE_NONE)
|
||||||
|
{
|
||||||
|
m_master_cycle_status.cycle_members.clear();
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_master_cycle_status.cycle_members = m_cycles[new_cycle_id];
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#ifdef SS_DEBUG
|
|
||||||
else
|
|
||||||
{
|
|
||||||
MXS_DEBUG("No valid master server found in the cluster.");
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
assign_master_and_slave();
|
assign_master_and_slave();
|
||||||
|
|
||||||
if (!m_ignore_external_masters)
|
if (!m_ignore_external_masters)
|
||||||
@ -1049,7 +1065,7 @@ string monitored_servers_to_string(const ServerArray& servers)
|
|||||||
{
|
{
|
||||||
rval += separator;
|
rval += separator;
|
||||||
rval += servers[i]->name();
|
rval += servers[i]->name();
|
||||||
separator = ",";
|
separator = ", ";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return rval;
|
return rval;
|
||||||
|
|||||||
@ -102,10 +102,16 @@ protected:
|
|||||||
void process_state_changes();
|
void process_state_changes();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
struct CycleInfo
|
||||||
|
{
|
||||||
|
int cycle_id = NodeData::CYCLE_NONE;
|
||||||
|
ServerArray cycle_members;
|
||||||
|
};
|
||||||
|
|
||||||
unsigned long m_id; /**< Monitor ID */
|
unsigned long m_id; /**< Monitor ID */
|
||||||
ServerArray m_servers; /**< Servers of the monitor */
|
ServerArray m_servers; /**< Servers of the monitor */
|
||||||
ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */
|
ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */
|
||||||
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
|
|
||||||
|
|
||||||
// Values updated by monitor
|
// Values updated by monitor
|
||||||
MariaDBServer* m_master; /**< Master server for Master/Slave replication */
|
MariaDBServer* m_master; /**< Master server for Master/Slave replication */
|
||||||
@ -114,6 +120,8 @@ private:
|
|||||||
std::string m_external_master_host; /**< External master host, for fail/switchover */
|
std::string m_external_master_host; /**< External master host, for fail/switchover */
|
||||||
int m_external_master_port; /**< External master port */
|
int m_external_master_port; /**< External master port */
|
||||||
bool m_cluster_modified; /**< Has an automatic failover/rejoin been performed this loop? */
|
bool m_cluster_modified; /**< Has an automatic failover/rejoin been performed this loop? */
|
||||||
|
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
|
||||||
|
CycleInfo m_master_cycle_status; /**< Info about master server cycle from previous round */
|
||||||
|
|
||||||
// Replication topology detection settings
|
// Replication topology detection settings
|
||||||
bool m_allow_cluster_recovery; /**< Allow failed servers to rejoin the cluster */
|
bool m_allow_cluster_recovery; /**< Allow failed servers to rejoin the cluster */
|
||||||
@ -196,6 +204,8 @@ private:
|
|||||||
MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
|
MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
|
||||||
void assign_master_and_slave();
|
void assign_master_and_slave();
|
||||||
void assign_slave_and_relay_master(MariaDBServer* node);
|
void assign_slave_and_relay_master(MariaDBServer* node);
|
||||||
|
bool master_no_longer_valid(std::string* reason_out);
|
||||||
|
bool cycle_has_master_server(ServerArray& cycle_servers);
|
||||||
|
|
||||||
// Switchover methods
|
// Switchover methods
|
||||||
bool switchover_check(SERVER* new_master, SERVER* current_master,
|
bool switchover_check(SERVER* new_master, SERVER* current_master,
|
||||||
|
|||||||
@ -38,8 +38,9 @@ SlaveStatus::SlaveStatus()
|
|||||||
, slave_sql_running(false)
|
, slave_sql_running(false)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server)
|
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index)
|
||||||
: m_server_base(monitored_server)
|
: m_server_base(monitored_server)
|
||||||
|
, m_config_index(config_index)
|
||||||
, m_print_update_errormsg(true)
|
, m_print_update_errormsg(true)
|
||||||
, m_version(version::UNKNOWN)
|
, m_version(version::UNKNOWN)
|
||||||
, m_server_id(SERVER_ID_UNKNOWN)
|
, m_server_id(SERVER_ID_UNKNOWN)
|
||||||
|
|||||||
@ -131,10 +131,10 @@ public:
|
|||||||
MXS_MONITORED_SERVER* m_server_base; /**< Monitored server base class/struct. MariaDBServer does not
|
MXS_MONITORED_SERVER* m_server_base; /**< Monitored server base class/struct. MariaDBServer does not
|
||||||
* own the struct, it is not freed (or connection closed) when
|
* own the struct, it is not freed (or connection closed) when
|
||||||
* a MariaDBServer is destroyed. Can be const on gcc 4.8 */
|
* a MariaDBServer is destroyed. Can be const on gcc 4.8 */
|
||||||
|
int m_config_index; /**< What position this server has in the monitor config */
|
||||||
bool m_print_update_errormsg;/**< Should an update error be printed. */
|
bool m_print_update_errormsg;/**< Should an update error be printed. */
|
||||||
version m_version; /**< Server version/type. */
|
version m_version; /**< Server version/type. */
|
||||||
int64_t m_server_id; /**< Value of @@server_id. Valid values are 32bit unsigned. */
|
int64_t m_server_id; /**< Value of @@server_id. Valid values are 32bit unsigned. */
|
||||||
|
|
||||||
bool m_read_only; /**< Value of @@read_only */
|
bool m_read_only; /**< Value of @@read_only */
|
||||||
size_t m_n_slaves_running; /**< Number of running slave connections */
|
size_t m_n_slaves_running; /**< Number of running slave connections */
|
||||||
int m_n_slave_heartbeats; /**< Number of received heartbeats */
|
int m_n_slave_heartbeats; /**< Number of received heartbeats */
|
||||||
@ -146,9 +146,9 @@ public:
|
|||||||
GtidList m_gtid_binlog_pos; /**< Gtid of latest event written to binlog. */
|
GtidList m_gtid_binlog_pos; /**< Gtid of latest event written to binlog. */
|
||||||
SlaveStatusArray m_slave_status; /**< Data returned from SHOW SLAVE STATUS */
|
SlaveStatusArray m_slave_status; /**< Data returned from SHOW SLAVE STATUS */
|
||||||
ReplicationSettings m_rpl_settings; /**< Miscellaneous replication related settings */
|
ReplicationSettings m_rpl_settings; /**< Miscellaneous replication related settings */
|
||||||
|
|
||||||
NodeData m_node; /**< Replication topology data */
|
NodeData m_node; /**< Replication topology data */
|
||||||
MariaDBServer(MXS_MONITORED_SERVER* monitored_server);
|
|
||||||
|
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index);
|
||||||
|
|
||||||
void monitor_server();
|
void monitor_server();
|
||||||
void update_server_info();
|
void update_server_info();
|
||||||
|
|||||||
@ -142,7 +142,7 @@ void MariaDBMonitor::Test::init_servers(int count)
|
|||||||
base_server->name = MXS_STRDUP(server_name.str().c_str());
|
base_server->name = MXS_STRDUP(server_name.str().c_str());
|
||||||
MXS_MONITORED_SERVER* mon_server = new MXS_MONITORED_SERVER; // Contents mostly undefined
|
MXS_MONITORED_SERVER* mon_server = new MXS_MONITORED_SERVER; // Contents mostly undefined
|
||||||
mon_server->server = base_server;
|
mon_server->server = base_server;
|
||||||
MariaDBServer* new_server = new MariaDBServer(mon_server);
|
MariaDBServer* new_server = new MariaDBServer(mon_server, i - 1);
|
||||||
new_server->m_server_id = i;
|
new_server->m_server_id = i;
|
||||||
m_monitor->m_servers.push_back(new_server);
|
m_monitor->m_servers.push_back(new_server);
|
||||||
m_monitor->m_server_info[mon_server] = new_server;
|
m_monitor->m_server_info[mon_server] = new_server;
|
||||||
|
|||||||
Reference in New Issue
Block a user