MXS-1845 Only select a master if current master is no longer usable
The purpose is to make the selected master server sticky. The master is reselected only if the current master is no longer a valid master.
This commit is contained in:
parent
95935991ab
commit
2f987d0b10
@ -14,8 +14,10 @@
|
||||
#include "mariadbmon.hh"
|
||||
#include <inttypes.h>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include <maxscale/modutil.h>
|
||||
#include <maxscale/mysql_utils.h>
|
||||
#include <algorithm>
|
||||
|
||||
using std::string;
|
||||
|
||||
@ -57,6 +59,11 @@ void topology_DFS(MariaDBServer* node, T* data, void (*visit_func)(MariaDBServer
|
||||
}
|
||||
}
|
||||
|
||||
static bool server_config_compare(const MariaDBServer* lhs, const MariaDBServer* rhs)
|
||||
{
|
||||
return lhs->m_config_index < rhs->m_config_index;
|
||||
}
|
||||
|
||||
/**
|
||||
* This function computes the replication tree from a set of monitored servers and returns the root server
|
||||
* with SERVER_MASTER bit. The tree is computed even for servers in 'maintenance' mode.
|
||||
@ -330,6 +337,8 @@ void MariaDBMonitor::tarjan_scc_visit_node(MariaDBServer *node, ServerArray* sta
|
||||
cycle_node.cycle = cycle_ind;
|
||||
ServerArray& members = m_cycles[cycle_ind]; // Creates array if didn't exist
|
||||
members.push_back(cycle_server);
|
||||
// Sort the cycle members according to monitor config order.
|
||||
std::sort(members.begin(), members.end(), server_config_compare);
|
||||
// All cycle elements popped. Next cycle...
|
||||
*next_cycle = cycle_ind + 1;
|
||||
}
|
||||
@ -998,24 +1007,31 @@ MariaDBServer* MariaDBMonitor::find_topology_master_server()
|
||||
// For each cycle, it's enough to take one sample server, as all members of a cycle have the same reach.
|
||||
for (auto iter = m_cycles.begin(); iter != m_cycles.end(); iter++)
|
||||
{
|
||||
ServerArray& cycle_members = m_cycles[(*iter).first];
|
||||
MariaDBServer* sample_server = find_master_inside_cycle(cycle_members);
|
||||
if (sample_server)
|
||||
int cycle_id = iter->first;
|
||||
ServerArray& cycle_members = m_cycles[cycle_id];
|
||||
// Check that no server in the cycle is replicating from outside the cycle. This requirement is
|
||||
// analogous with the same requirement for non-cycle servers.
|
||||
if (!cycle_has_master_server(cycle_members))
|
||||
{
|
||||
master_candidates.push_back(sample_server);
|
||||
}
|
||||
else
|
||||
{
|
||||
// No single server in the cycle was viable.
|
||||
const char WARN_MSG[] = "No valid master server could be found in the cycle with servers '%s'.";
|
||||
string server_names = monitored_servers_to_string(cycle_members);
|
||||
MXS_WARNING(WARN_MSG, server_names.c_str());
|
||||
|
||||
for (auto iter2 = cycle_members.begin(); iter2 != cycle_members.end(); iter2++)
|
||||
MariaDBServer* sample_server = find_master_inside_cycle(cycle_members);
|
||||
if (sample_server)
|
||||
{
|
||||
MariaDBServer* disqualified_server = *iter2;
|
||||
string reasons = disqualify_reasons_to_string(disqualified_server);
|
||||
MXS_WARNING(SERVER_DISQUALIFIED, disqualified_server->name(), reasons.c_str());
|
||||
master_candidates.push_back(sample_server);
|
||||
}
|
||||
else
|
||||
{
|
||||
// No single server in the cycle was viable.
|
||||
const char WARN_MSG[] = "No valid master server could be found in the cycle with "
|
||||
"servers '%s'.";
|
||||
string server_names = monitored_servers_to_string(cycle_members);
|
||||
MXS_WARNING(WARN_MSG, server_names.c_str());
|
||||
|
||||
for (auto iter2 = cycle_members.begin(); iter2 != cycle_members.end(); iter2++)
|
||||
{
|
||||
MariaDBServer* disqualified_server = *iter2;
|
||||
string reasons = disqualify_reasons_to_string(disqualified_server);
|
||||
MXS_WARNING(SERVER_DISQUALIFIED, disqualified_server->name(), reasons.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1203,3 +1219,98 @@ void MariaDBMonitor::assign_slave_and_relay_master(MariaDBServer* node)
|
||||
node->set_status(SERVER_RELAY_MASTER);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Should a new master server be selected?
|
||||
*
|
||||
* @param reason_out Output for a text description
|
||||
* @return True, if the current master has changed in a way that a new master should be selected.
|
||||
*/
|
||||
bool MariaDBMonitor::master_no_longer_valid(std::string* reason_out)
|
||||
{
|
||||
// The master server of the cluster needs to be re-calculated in the following four cases:
|
||||
bool rval = false;
|
||||
// 1) There is no master.
|
||||
if (m_master == NULL)
|
||||
{
|
||||
rval = true;
|
||||
}
|
||||
// 2) read_only has been activated on the master.
|
||||
else if (m_master->is_read_only())
|
||||
{
|
||||
rval = true;
|
||||
*reason_out = "it is in read-only mode";
|
||||
}
|
||||
// 3) The master was a non-replicating master (not in a cycle) but now has a slave connection.
|
||||
else if (m_master_cycle_status.cycle_id == NodeData::CYCLE_NONE)
|
||||
{
|
||||
// The master should not have a master of its own.
|
||||
if (!m_master->m_node.parents.empty())
|
||||
{
|
||||
rval = true;
|
||||
*reason_out = "it has started replicating from another server in the cluster.";
|
||||
}
|
||||
}
|
||||
// 4) The master was part of a cycle but is no longer, or one of the servers in the cycle is
|
||||
// replicating from a server outside the cycle.
|
||||
else
|
||||
{
|
||||
/* The master was previously in a cycle. Compare the current cycle to the previous data and see
|
||||
* if the cycle is still the best multimaster group. */
|
||||
int current_cycle_id = m_master->m_node.cycle;
|
||||
|
||||
// 4a) The master is no longer in a cycle.
|
||||
if (current_cycle_id == NodeData::CYCLE_NONE)
|
||||
{
|
||||
rval = true;
|
||||
ServerArray& old_members = m_master_cycle_status.cycle_members;
|
||||
string server_names_old = monitored_servers_to_string(old_members);
|
||||
*reason_out = "it is no longer in the multimaster group (" + server_names_old + ").";
|
||||
}
|
||||
// 4b) The master is still in a cycle but the cycle has gained a master outside of the cycle.
|
||||
else
|
||||
{
|
||||
ServerArray& current_members = m_cycles[current_cycle_id];
|
||||
if (cycle_has_master_server(current_members))
|
||||
{
|
||||
rval = true;
|
||||
string server_names_current = monitored_servers_to_string(current_members);
|
||||
*reason_out = "a server in the master's multimaster group (" + server_names_current +
|
||||
") is replicating from a server not in the group.";
|
||||
}
|
||||
}
|
||||
}
|
||||
return rval;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if any of the servers in the cycle is replicating from a server not in the cycle. External masters
|
||||
* do not count.
|
||||
*
|
||||
* @param cycle The cycle to check
|
||||
* @return True if a server is replicating from a master not in the same cycle
|
||||
*/
|
||||
bool MariaDBMonitor::cycle_has_master_server(ServerArray& cycle_servers)
|
||||
{
|
||||
bool outside_replication = false;
|
||||
int cycle_id = cycle_servers.front()->m_node.cycle;
|
||||
// Looks good, check that no cycle server is replicating from elsewhere.
|
||||
for (auto iter = cycle_servers.begin(); iter != cycle_servers.end() && !outside_replication; iter++)
|
||||
{
|
||||
MariaDBServer* server = *iter;
|
||||
for (auto iter_master = server->m_node.parents.begin();
|
||||
iter_master != server->m_node.parents.end();
|
||||
iter_master++)
|
||||
{
|
||||
if ((*iter_master)->m_node.cycle != cycle_id)
|
||||
{
|
||||
// Cycle member is replicating from a server that is not in the current cycle. The
|
||||
// cycle is not a valid "master" cycle.
|
||||
outside_replication = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return outside_replication;
|
||||
}
|
@ -77,7 +77,7 @@ void MariaDBMonitor::reset_server_info()
|
||||
// Next, initialize the data.
|
||||
for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next)
|
||||
{
|
||||
m_servers.push_back(new MariaDBServer(mon_server));
|
||||
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size()));
|
||||
}
|
||||
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
|
||||
{
|
||||
@ -392,20 +392,36 @@ void MariaDBMonitor::tick()
|
||||
|
||||
build_replication_graph();
|
||||
find_graph_cycles();
|
||||
// Use the information to find the so far best master server.
|
||||
MariaDBServer* root_master = find_topology_master_server();
|
||||
if (root_master)
|
||||
string reason;
|
||||
MariaDBServer* root_master = m_master; // TODO: Refactor this out by reducing use of root_master
|
||||
if (master_no_longer_valid(&reason))
|
||||
{
|
||||
MXS_DEBUG("Server '%s' is the best master candidate with %d slaves.",
|
||||
root_master->name(), root_master->m_node.reach);
|
||||
m_master = root_master;
|
||||
if (m_master && !reason.empty())
|
||||
{
|
||||
MXS_WARNING("The previous master server '%s' is no longer a valid master because %s",
|
||||
m_master->name(), reason.c_str());
|
||||
}
|
||||
|
||||
// The current master is no longer ok (or it never was). Find another. Master changes are logged
|
||||
// by the log_master_changes()-method.
|
||||
root_master = find_topology_master_server();
|
||||
if (root_master)
|
||||
{
|
||||
m_master = root_master;
|
||||
// A new master has been set. Save some data regarding the type of the master.
|
||||
int new_cycle_id = m_master->m_node.cycle;
|
||||
m_master_cycle_status.cycle_id = new_cycle_id;
|
||||
if (new_cycle_id == NodeData::CYCLE_NONE)
|
||||
{
|
||||
m_master_cycle_status.cycle_members.clear();
|
||||
}
|
||||
else
|
||||
{
|
||||
m_master_cycle_status.cycle_members = m_cycles[new_cycle_id];
|
||||
}
|
||||
}
|
||||
}
|
||||
#ifdef SS_DEBUG
|
||||
else
|
||||
{
|
||||
MXS_DEBUG("No valid master server found in the cluster.");
|
||||
}
|
||||
#endif
|
||||
|
||||
assign_master_and_slave();
|
||||
|
||||
if (!m_ignore_external_masters)
|
||||
@ -1049,7 +1065,7 @@ string monitored_servers_to_string(const ServerArray& servers)
|
||||
{
|
||||
rval += separator;
|
||||
rval += servers[i]->name();
|
||||
separator = ",";
|
||||
separator = ", ";
|
||||
}
|
||||
}
|
||||
return rval;
|
||||
|
@ -102,10 +102,16 @@ protected:
|
||||
void process_state_changes();
|
||||
|
||||
private:
|
||||
|
||||
struct CycleInfo
|
||||
{
|
||||
int cycle_id = NodeData::CYCLE_NONE;
|
||||
ServerArray cycle_members;
|
||||
};
|
||||
|
||||
unsigned long m_id; /**< Monitor ID */
|
||||
ServerArray m_servers; /**< Servers of the monitor */
|
||||
ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */
|
||||
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
|
||||
|
||||
// Values updated by monitor
|
||||
MariaDBServer* m_master; /**< Master server for Master/Slave replication */
|
||||
@ -114,6 +120,8 @@ private:
|
||||
std::string m_external_master_host; /**< External master host, for fail/switchover */
|
||||
int m_external_master_port; /**< External master port */
|
||||
bool m_cluster_modified; /**< Has an automatic failover/rejoin been performed this loop? */
|
||||
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
|
||||
CycleInfo m_master_cycle_status; /**< Info about master server cycle from previous round */
|
||||
|
||||
// Replication topology detection settings
|
||||
bool m_allow_cluster_recovery; /**< Allow failed servers to rejoin the cluster */
|
||||
@ -196,6 +204,8 @@ private:
|
||||
MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
|
||||
void assign_master_and_slave();
|
||||
void assign_slave_and_relay_master(MariaDBServer* node);
|
||||
bool master_no_longer_valid(std::string* reason_out);
|
||||
bool cycle_has_master_server(ServerArray& cycle_servers);
|
||||
|
||||
// Switchover methods
|
||||
bool switchover_check(SERVER* new_master, SERVER* current_master,
|
||||
|
@ -38,8 +38,9 @@ SlaveStatus::SlaveStatus()
|
||||
, slave_sql_running(false)
|
||||
{}
|
||||
|
||||
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server)
|
||||
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index)
|
||||
: m_server_base(monitored_server)
|
||||
, m_config_index(config_index)
|
||||
, m_print_update_errormsg(true)
|
||||
, m_version(version::UNKNOWN)
|
||||
, m_server_id(SERVER_ID_UNKNOWN)
|
||||
|
@ -131,10 +131,10 @@ public:
|
||||
MXS_MONITORED_SERVER* m_server_base; /**< Monitored server base class/struct. MariaDBServer does not
|
||||
* own the struct, it is not freed (or connection closed) when
|
||||
* a MariaDBServer is destroyed. Can be const on gcc 4.8 */
|
||||
int m_config_index; /**< What position this server has in the monitor config */
|
||||
bool m_print_update_errormsg;/**< Should an update error be printed. */
|
||||
version m_version; /**< Server version/type. */
|
||||
int64_t m_server_id; /**< Value of @@server_id. Valid values are 32bit unsigned. */
|
||||
|
||||
bool m_read_only; /**< Value of @@read_only */
|
||||
size_t m_n_slaves_running; /**< Number of running slave connections */
|
||||
int m_n_slave_heartbeats; /**< Number of received heartbeats */
|
||||
@ -146,9 +146,9 @@ public:
|
||||
GtidList m_gtid_binlog_pos; /**< Gtid of latest event written to binlog. */
|
||||
SlaveStatusArray m_slave_status; /**< Data returned from SHOW SLAVE STATUS */
|
||||
ReplicationSettings m_rpl_settings; /**< Miscellaneous replication related settings */
|
||||
|
||||
NodeData m_node; /**< Replication topology data */
|
||||
MariaDBServer(MXS_MONITORED_SERVER* monitored_server);
|
||||
|
||||
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index);
|
||||
|
||||
void monitor_server();
|
||||
void update_server_info();
|
||||
|
@ -142,7 +142,7 @@ void MariaDBMonitor::Test::init_servers(int count)
|
||||
base_server->name = MXS_STRDUP(server_name.str().c_str());
|
||||
MXS_MONITORED_SERVER* mon_server = new MXS_MONITORED_SERVER; // Contents mostly undefined
|
||||
mon_server->server = base_server;
|
||||
MariaDBServer* new_server = new MariaDBServer(mon_server);
|
||||
MariaDBServer* new_server = new MariaDBServer(mon_server, i - 1);
|
||||
new_server->m_server_id = i;
|
||||
m_monitor->m_servers.push_back(new_server);
|
||||
m_monitor->m_server_info[mon_server] = new_server;
|
||||
|
Loading…
x
Reference in New Issue
Block a user