MXS-1845 Only select a master if current master is no longer usable

The purpose is to make the selected master server sticky. The master is reselected only
if the current master is no longer a valid master.
This commit is contained in:
Esa Korhonen 2018-06-12 10:27:23 +03:00
parent 95935991ab
commit 2f987d0b10
6 changed files with 174 additions and 36 deletions

View File

@ -14,8 +14,10 @@
#include "mariadbmon.hh"
#include <inttypes.h>
#include <string>
#include <sstream>
#include <maxscale/modutil.h>
#include <maxscale/mysql_utils.h>
#include <algorithm>
using std::string;
@ -57,6 +59,11 @@ void topology_DFS(MariaDBServer* node, T* data, void (*visit_func)(MariaDBServer
}
}
static bool server_config_compare(const MariaDBServer* lhs, const MariaDBServer* rhs)
{
return lhs->m_config_index < rhs->m_config_index;
}
/**
* This function computes the replication tree from a set of monitored servers and returns the root server
* with SERVER_MASTER bit. The tree is computed even for servers in 'maintenance' mode.
@ -330,6 +337,8 @@ void MariaDBMonitor::tarjan_scc_visit_node(MariaDBServer *node, ServerArray* sta
cycle_node.cycle = cycle_ind;
ServerArray& members = m_cycles[cycle_ind]; // Creates array if didn't exist
members.push_back(cycle_server);
// Sort the cycle members according to monitor config order.
std::sort(members.begin(), members.end(), server_config_compare);
// All cycle elements popped. Next cycle...
*next_cycle = cycle_ind + 1;
}
@ -998,24 +1007,31 @@ MariaDBServer* MariaDBMonitor::find_topology_master_server()
// For each cycle, it's enough to take one sample server, as all members of a cycle have the same reach.
for (auto iter = m_cycles.begin(); iter != m_cycles.end(); iter++)
{
ServerArray& cycle_members = m_cycles[(*iter).first];
MariaDBServer* sample_server = find_master_inside_cycle(cycle_members);
if (sample_server)
int cycle_id = iter->first;
ServerArray& cycle_members = m_cycles[cycle_id];
// Check that no server in the cycle is replicating from outside the cycle. This requirement is
// analogous with the same requirement for non-cycle servers.
if (!cycle_has_master_server(cycle_members))
{
master_candidates.push_back(sample_server);
}
else
{
// No single server in the cycle was viable.
const char WARN_MSG[] = "No valid master server could be found in the cycle with servers '%s'.";
string server_names = monitored_servers_to_string(cycle_members);
MXS_WARNING(WARN_MSG, server_names.c_str());
for (auto iter2 = cycle_members.begin(); iter2 != cycle_members.end(); iter2++)
MariaDBServer* sample_server = find_master_inside_cycle(cycle_members);
if (sample_server)
{
MariaDBServer* disqualified_server = *iter2;
string reasons = disqualify_reasons_to_string(disqualified_server);
MXS_WARNING(SERVER_DISQUALIFIED, disqualified_server->name(), reasons.c_str());
master_candidates.push_back(sample_server);
}
else
{
// No single server in the cycle was viable.
const char WARN_MSG[] = "No valid master server could be found in the cycle with "
"servers '%s'.";
string server_names = monitored_servers_to_string(cycle_members);
MXS_WARNING(WARN_MSG, server_names.c_str());
for (auto iter2 = cycle_members.begin(); iter2 != cycle_members.end(); iter2++)
{
MariaDBServer* disqualified_server = *iter2;
string reasons = disqualify_reasons_to_string(disqualified_server);
MXS_WARNING(SERVER_DISQUALIFIED, disqualified_server->name(), reasons.c_str());
}
}
}
}
@ -1203,3 +1219,98 @@ void MariaDBMonitor::assign_slave_and_relay_master(MariaDBServer* node)
node->set_status(SERVER_RELAY_MASTER);
}
}
/**
* Should a new master server be selected?
*
* @param reason_out Output for a text description
* @return True, if the current master has changed in a way that a new master should be selected.
*/
bool MariaDBMonitor::master_no_longer_valid(std::string* reason_out)
{
// The master server of the cluster needs to be re-calculated in the following four cases:
bool rval = false;
// 1) There is no master.
if (m_master == NULL)
{
rval = true;
}
// 2) read_only has been activated on the master.
else if (m_master->is_read_only())
{
rval = true;
*reason_out = "it is in read-only mode";
}
// 3) The master was a non-replicating master (not in a cycle) but now has a slave connection.
else if (m_master_cycle_status.cycle_id == NodeData::CYCLE_NONE)
{
// The master should not have a master of its own.
if (!m_master->m_node.parents.empty())
{
rval = true;
*reason_out = "it has started replicating from another server in the cluster.";
}
}
// 4) The master was part of a cycle but is no longer, or one of the servers in the cycle is
// replicating from a server outside the cycle.
else
{
/* The master was previously in a cycle. Compare the current cycle to the previous data and see
* if the cycle is still the best multimaster group. */
int current_cycle_id = m_master->m_node.cycle;
// 4a) The master is no longer in a cycle.
if (current_cycle_id == NodeData::CYCLE_NONE)
{
rval = true;
ServerArray& old_members = m_master_cycle_status.cycle_members;
string server_names_old = monitored_servers_to_string(old_members);
*reason_out = "it is no longer in the multimaster group (" + server_names_old + ").";
}
// 4b) The master is still in a cycle but the cycle has gained a master outside of the cycle.
else
{
ServerArray& current_members = m_cycles[current_cycle_id];
if (cycle_has_master_server(current_members))
{
rval = true;
string server_names_current = monitored_servers_to_string(current_members);
*reason_out = "a server in the master's multimaster group (" + server_names_current +
") is replicating from a server not in the group.";
}
}
}
return rval;
}
/**
* Check if any of the servers in the cycle is replicating from a server not in the cycle. External masters
* do not count.
*
* @param cycle The cycle to check
* @return True if a server is replicating from a master not in the same cycle
*/
bool MariaDBMonitor::cycle_has_master_server(ServerArray& cycle_servers)
{
bool outside_replication = false;
int cycle_id = cycle_servers.front()->m_node.cycle;
// Looks good, check that no cycle server is replicating from elsewhere.
for (auto iter = cycle_servers.begin(); iter != cycle_servers.end() && !outside_replication; iter++)
{
MariaDBServer* server = *iter;
for (auto iter_master = server->m_node.parents.begin();
iter_master != server->m_node.parents.end();
iter_master++)
{
if ((*iter_master)->m_node.cycle != cycle_id)
{
// Cycle member is replicating from a server that is not in the current cycle. The
// cycle is not a valid "master" cycle.
outside_replication = true;
break;
}
}
}
return outside_replication;
}

View File

@ -77,7 +77,7 @@ void MariaDBMonitor::reset_server_info()
// Next, initialize the data.
for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next)
{
m_servers.push_back(new MariaDBServer(mon_server));
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size()));
}
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
@ -392,20 +392,36 @@ void MariaDBMonitor::tick()
build_replication_graph();
find_graph_cycles();
// Use the information to find the so far best master server.
MariaDBServer* root_master = find_topology_master_server();
if (root_master)
string reason;
MariaDBServer* root_master = m_master; // TODO: Refactor this out by reducing use of root_master
if (master_no_longer_valid(&reason))
{
MXS_DEBUG("Server '%s' is the best master candidate with %d slaves.",
root_master->name(), root_master->m_node.reach);
m_master = root_master;
if (m_master && !reason.empty())
{
MXS_WARNING("The previous master server '%s' is no longer a valid master because %s",
m_master->name(), reason.c_str());
}
// The current master is no longer ok (or it never was). Find another. Master changes are logged
// by the log_master_changes()-method.
root_master = find_topology_master_server();
if (root_master)
{
m_master = root_master;
// A new master has been set. Save some data regarding the type of the master.
int new_cycle_id = m_master->m_node.cycle;
m_master_cycle_status.cycle_id = new_cycle_id;
if (new_cycle_id == NodeData::CYCLE_NONE)
{
m_master_cycle_status.cycle_members.clear();
}
else
{
m_master_cycle_status.cycle_members = m_cycles[new_cycle_id];
}
}
}
#ifdef SS_DEBUG
else
{
MXS_DEBUG("No valid master server found in the cluster.");
}
#endif
assign_master_and_slave();
if (!m_ignore_external_masters)
@ -1049,7 +1065,7 @@ string monitored_servers_to_string(const ServerArray& servers)
{
rval += separator;
rval += servers[i]->name();
separator = ",";
separator = ", ";
}
}
return rval;

View File

@ -102,10 +102,16 @@ protected:
void process_state_changes();
private:
struct CycleInfo
{
int cycle_id = NodeData::CYCLE_NONE;
ServerArray cycle_members;
};
unsigned long m_id; /**< Monitor ID */
ServerArray m_servers; /**< Servers of the monitor */
ServerInfoMap m_server_info; /**< Map from server base struct to MariaDBServer */
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
// Values updated by monitor
MariaDBServer* m_master; /**< Master server for Master/Slave replication */
@ -114,6 +120,8 @@ private:
std::string m_external_master_host; /**< External master host, for fail/switchover */
int m_external_master_port; /**< External master port */
bool m_cluster_modified; /**< Has an automatic failover/rejoin been performed this loop? */
CycleMap m_cycles; /**< Map from cycle number to cycle member servers */
CycleInfo m_master_cycle_status; /**< Info about master server cycle from previous round */
// Replication topology detection settings
bool m_allow_cluster_recovery; /**< Allow failed servers to rejoin the cluster */
@ -196,6 +204,8 @@ private:
MariaDBServer* find_master_inside_cycle(ServerArray& cycle_servers);
void assign_master_and_slave();
void assign_slave_and_relay_master(MariaDBServer* node);
bool master_no_longer_valid(std::string* reason_out);
bool cycle_has_master_server(ServerArray& cycle_servers);
// Switchover methods
bool switchover_check(SERVER* new_master, SERVER* current_master,

View File

@ -38,8 +38,9 @@ SlaveStatus::SlaveStatus()
, slave_sql_running(false)
{}
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server)
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index)
: m_server_base(monitored_server)
, m_config_index(config_index)
, m_print_update_errormsg(true)
, m_version(version::UNKNOWN)
, m_server_id(SERVER_ID_UNKNOWN)

View File

@ -131,10 +131,10 @@ public:
MXS_MONITORED_SERVER* m_server_base; /**< Monitored server base class/struct. MariaDBServer does not
* own the struct, it is not freed (or connection closed) when
* a MariaDBServer is destroyed. Can be const on gcc 4.8 */
int m_config_index; /**< What position this server has in the monitor config */
bool m_print_update_errormsg;/**< Should an update error be printed. */
version m_version; /**< Server version/type. */
int64_t m_server_id; /**< Value of @@server_id. Valid values are 32bit unsigned. */
bool m_read_only; /**< Value of @@read_only */
size_t m_n_slaves_running; /**< Number of running slave connections */
int m_n_slave_heartbeats; /**< Number of received heartbeats */
@ -146,9 +146,9 @@ public:
GtidList m_gtid_binlog_pos; /**< Gtid of latest event written to binlog. */
SlaveStatusArray m_slave_status; /**< Data returned from SHOW SLAVE STATUS */
ReplicationSettings m_rpl_settings; /**< Miscellaneous replication related settings */
NodeData m_node; /**< Replication topology data */
MariaDBServer(MXS_MONITORED_SERVER* monitored_server);
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index);
void monitor_server();
void update_server_info();

View File

@ -142,7 +142,7 @@ void MariaDBMonitor::Test::init_servers(int count)
base_server->name = MXS_STRDUP(server_name.str().c_str());
MXS_MONITORED_SERVER* mon_server = new MXS_MONITORED_SERVER; // Contents mostly undefined
mon_server->server = base_server;
MariaDBServer* new_server = new MariaDBServer(mon_server);
MariaDBServer* new_server = new MariaDBServer(mon_server, i - 1);
new_server->m_server_id = i;
m_monitor->m_servers.push_back(new_server);
m_monitor->m_server_info[mon_server] = new_server;