MXS-2168 Add 'assume_unique_hostnames'-setting to MariaDBMonitor
Adds the setting and uses it during replication graph creation and in the most important checks.
This commit is contained in:
parent
64a9a5135e
commit
1a046bd453
@ -160,8 +160,13 @@ void MariaDBMonitor::tarjan_scc_visit_node(MariaDBServer* node,
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Use slave status and server id information to build the replication graph. Needs to be called whenever
|
||||
* the topology has changed, or is suspected to have changed.
|
||||
*/
|
||||
void MariaDBMonitor::build_replication_graph()
|
||||
{
|
||||
const bool use_hostnames = m_assume_unique_hostnames;
|
||||
// First, reset all node data.
|
||||
for (MariaDBServer* server : m_servers)
|
||||
{
|
||||
@ -169,39 +174,64 @@ void MariaDBMonitor::build_replication_graph()
|
||||
server->m_node.reset_results();
|
||||
}
|
||||
|
||||
/* Here, all slave connections are added to the graph, even if the IO thread cannot connect. Strictly
|
||||
* speaking, building the parents-array is not required as the data already exists. This construction
|
||||
* is more for convenience and faster access later on. */
|
||||
for (MariaDBServer* slave : m_servers)
|
||||
{
|
||||
/* All servers are accepted in this loop, even if the server is [Down] or [Maintenance]. For these
|
||||
* servers, we just use the latest available information. Not adding such servers could suddenly
|
||||
* change the topology quite a bit and all it would take is a momentary network failure. */
|
||||
|
||||
/* Check all slave connections of all servers. Connections are added even if one or both endpoints
|
||||
* are down or in maintenance. */
|
||||
for (SlaveStatus& slave_conn : slave->m_slave_status)
|
||||
{
|
||||
/* We always trust the "Master_Server_Id"-field of the SHOW SLAVE STATUS output, as long as
|
||||
* the id is > 0 (server uses 0 for default). This means that the graph constructed is faulty if
|
||||
* an old "Master_Server_Id"- value is read from a slave which is still trying to connect to
|
||||
* a new master. However, a server is only designated [Slave] if both IO- and SQL-threads are
|
||||
* running fine, so the faulty graph does not cause wrong status settings. */
|
||||
|
||||
/* IF THIS PART IS CHANGED, CHANGE THE COMPARISON IN 'sstatus_array_topology_equal'
|
||||
* (in MariaDBServer) accordingly so that any possible topology changes are detected. */
|
||||
auto master_id = slave_conn.master_server_id;
|
||||
if (slave_conn.slave_io_running != SlaveStatus::SLAVE_IO_NO && master_id > 0)
|
||||
if (slave_conn.slave_io_running != SlaveStatus::SLAVE_IO_NO && slave_conn.slave_sql_running)
|
||||
{
|
||||
// Valid slave connection, find the MariaDBServer with this id.
|
||||
auto master = get_server(master_id);
|
||||
if (master != NULL)
|
||||
// Looks promising, check hostname or server id.
|
||||
MariaDBServer* found_master = NULL;
|
||||
bool is_external = false;
|
||||
if (use_hostnames)
|
||||
{
|
||||
slave->m_node.parents.push_back(master);
|
||||
master->m_node.children.push_back(slave);
|
||||
found_master = get_server(slave_conn.master_host, slave_conn.master_port);
|
||||
if (!found_master)
|
||||
{
|
||||
// Must be an external server.
|
||||
is_external = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Cannot trust hostname:port since network may be complicated. Instead,
|
||||
* trust the "Master_Server_Id"-field of the SHOW SLAVE STATUS output if
|
||||
* the slave connection has been seen connected before. This means that
|
||||
* the graph will miss slave-master relations that have not connected
|
||||
* while the monitor has been running. TODO: This data should be saved so
|
||||
* that monitor restarts do not lose this information. */
|
||||
if (slave_conn.seen_connected)
|
||||
{
|
||||
// Valid slave connection, find the MariaDBServer with the matching server id.
|
||||
found_master = get_server(slave_conn.master_server_id);
|
||||
if (!found_master)
|
||||
{
|
||||
/* Likely an external master. It's possible that the master is a monitored
|
||||
* server which has not been queried yet and the monitor does not know its
|
||||
* id. */
|
||||
is_external = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Valid slave connection, find the MariaDBServer with this id.
|
||||
if (found_master)
|
||||
{
|
||||
/* Building the parents-array is not strictly required as the same data is in
|
||||
* the children-array. This construction is more for convenience and faster
|
||||
* access later on. */
|
||||
slave->m_node.parents.push_back(found_master);
|
||||
found_master->m_node.children.push_back(slave);
|
||||
}
|
||||
else if (is_external)
|
||||
{
|
||||
// This is an external master connection. Save just the master id for now.
|
||||
slave->m_node.external_masters.push_back(master_id);
|
||||
// TODO: Save host:port instead
|
||||
slave->m_node.external_masters.push_back(slave_conn.master_server_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -47,6 +47,7 @@ static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout";
|
||||
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
|
||||
static const char CN_DETECT_STANDALONE_MASTER[] = "detect_standalone_master";
|
||||
static const char CN_MAINTENANCE_ON_LOW_DISK_SPACE[] = "maintenance_on_low_disk_space";
|
||||
static const char CN_ASSUME_UNIQUE_HOSTNAMES[] = "assume_unique_hostnames";
|
||||
// Parameters for master failure verification and timeout
|
||||
static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure";
|
||||
static const char CN_MASTER_FAILURE_TIMEOUT[] = "master_failure_timeout";
|
||||
@ -78,7 +79,7 @@ void MariaDBMonitor::reset_server_info()
|
||||
// Next, initialize the data.
|
||||
for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next)
|
||||
{
|
||||
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size()));
|
||||
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size(), m_assume_unique_hostnames));
|
||||
}
|
||||
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
|
||||
{
|
||||
@ -151,6 +152,22 @@ MariaDBServer* MariaDBMonitor::get_server(SERVER* server)
|
||||
return found;
|
||||
}
|
||||
|
||||
/**
 * Find the monitored server whose configured address and port match the given values.
 * The servers are scanned in the order they appear in m_servers and the first match wins.
 *
 * @param host Address to compare against the address of each server
 * @param port Port to compare against the port of each server
 * @return The first matching server, or NULL if no server matches
 */
MariaDBServer* MariaDBMonitor::get_server(const std::string& host, int port)
{
    // TODO: Do this with a map lookup
    for (MariaDBServer* candidate : m_servers)
    {
        SERVER* base = candidate->m_server_base->server;
        const bool address_matches = (host == base->address);
        if (address_matches && base->port == port)
        {
            // First hit is the answer, no need to look further.
            return candidate;
        }
    }
    // Nothing in the server list has this host:port combination.
    return NULL;
}
|
||||
|
||||
bool MariaDBMonitor::set_replication_credentials(const MXS_CONFIG_PARAMETER* params)
|
||||
{
|
||||
bool rval = false;
|
||||
@ -197,6 +214,7 @@ bool MariaDBMonitor::configure(const MXS_CONFIG_PARAMETER* params)
|
||||
m_detect_stale_slave = config_get_bool(params, "detect_stale_slave");
|
||||
m_ignore_external_masters = config_get_bool(params, "ignore_external_masters");
|
||||
m_detect_standalone_master = config_get_bool(params, CN_DETECT_STANDALONE_MASTER);
|
||||
m_assume_unique_hostnames = config_get_bool(params, CN_ASSUME_UNIQUE_HOSTNAMES);
|
||||
m_failcount = config_get_integer(params, CN_FAILCOUNT);
|
||||
m_failover_timeout = config_get_integer(params, CN_FAILOVER_TIMEOUT);
|
||||
m_switchover_timeout = config_get_integer(params, CN_SWITCHOVER_TIMEOUT);
|
||||
@ -230,6 +248,25 @@ bool MariaDBMonitor::configure(const MXS_CONFIG_PARAMETER* params)
|
||||
MXS_ERROR("Both '%s' and '%s' must be defined", CN_REPLICATION_USER, CN_REPLICATION_PASSWORD);
|
||||
settings_ok = false;
|
||||
}
|
||||
if (!m_assume_unique_hostnames)
|
||||
{
|
||||
const char requires[] = "%s requires that %s is on.";
|
||||
if (m_auto_failover)
|
||||
{
|
||||
MXB_ERROR(requires, CN_AUTO_FAILOVER, CN_ASSUME_UNIQUE_HOSTNAMES);
|
||||
settings_ok = false;
|
||||
}
|
||||
if (m_switchover_on_low_disk_space)
|
||||
{
|
||||
MXB_ERROR(requires, CN_SWITCHOVER_ON_LOW_DISK_SPACE, CN_ASSUME_UNIQUE_HOSTNAMES);
|
||||
settings_ok = false;
|
||||
}
|
||||
if (m_auto_rejoin)
|
||||
{
|
||||
MXB_ERROR(requires, CN_AUTO_REJOIN, CN_ASSUME_UNIQUE_HOSTNAMES);
|
||||
settings_ok = false;
|
||||
}
|
||||
}
|
||||
return settings_ok;
|
||||
}
|
||||
|
||||
@ -1084,6 +1121,9 @@ extern "C" MXS_MODULE* MXS_CREATE_MODULE()
|
||||
{
|
||||
CN_HANDLE_EVENTS, MXS_MODULE_PARAM_BOOL, "true"
|
||||
},
|
||||
{
|
||||
CN_ASSUME_UNIQUE_HOSTNAMES, MXS_MODULE_PARAM_BOOL, "true"
|
||||
},
|
||||
{MXS_END_MODULE_PARAMS}
|
||||
}
|
||||
};
|
||||
|
@ -200,6 +200,7 @@ private:
|
||||
* TODO: think about removing */
|
||||
bool m_ignore_external_masters = false; /* Ignore masters outside of the monitor configuration.
|
||||
* TODO: requires work */
|
||||
bool m_assume_unique_hostnames = true; /* Are server hostnames consistent between MaxScale and servers */
|
||||
int m_failcount = 1; /* Number of ticks master must be down before it's considered
|
||||
* totally down, allowing failover or master change. */
|
||||
|
||||
@ -253,6 +254,7 @@ private:
|
||||
MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
|
||||
MariaDBServer* get_server(int64_t id);
|
||||
MariaDBServer* get_server(SERVER* server);
|
||||
MariaDBServer* get_server(const std::string& host, int port);
|
||||
|
||||
// Cluster discovery and status assignment methods, top levels
|
||||
void update_server(MariaDBServer* server);
|
||||
|
@ -35,9 +35,11 @@ public:
|
||||
std::string status;
|
||||
};
|
||||
|
||||
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index)
|
||||
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index,
|
||||
bool assume_unique_hostnames)
|
||||
: m_server_base(monitored_server)
|
||||
, m_config_index(config_index)
|
||||
, m_assume_unique_hostnames(assume_unique_hostnames)
|
||||
{
|
||||
mxb_assert(monitored_server);
|
||||
}
|
||||
@ -951,8 +953,9 @@ void MariaDBServer::set_status(uint64_t bits)
|
||||
|
||||
/**
|
||||
* Compare if the given slave status array is equal to the one stored in the MariaDBServer.
|
||||
* Only compares the parts relevant for building replication topology: master server id:s and
|
||||
* slave connection io states.
|
||||
* Only compares the parts relevant for building replication topology: slave IO/SQL state,
|
||||
* host:port and master server id:s. When unsure, return false. This must match
|
||||
* 'build_replication_graph()' in the monitor class.
|
||||
*
|
||||
* @param new_slave_status Right hand side
|
||||
* @return True if equal
|
||||
@ -969,10 +972,14 @@ bool MariaDBServer::sstatus_array_topology_equal(const SlaveStatusArray& new_sla
|
||||
{
|
||||
for (size_t i = 0; i < old_slave_status.size(); i++)
|
||||
{
|
||||
// It's enough to check just the following two items, as these are used in
|
||||
// 'build_replication_graph'.
|
||||
if (old_slave_status[i].slave_io_running != new_slave_status[i].slave_io_running
|
||||
|| old_slave_status[i].master_server_id != new_slave_status[i].master_server_id)
|
||||
const auto new_row = new_slave_status[i];
|
||||
const auto old_row = old_slave_status[i];
|
||||
// Strictly speaking, the following should depend on the 'assume_unique_hostnames',
|
||||
// but the situations where it would make a difference are so rare they can be ignored.
|
||||
if (new_row.slave_io_running != old_row.slave_io_running
|
||||
|| new_row.slave_sql_running != old_row.slave_sql_running
|
||||
|| new_row.master_host != old_row.master_host || new_row.master_port != old_row.master_port
|
||||
|| new_row.master_server_id != old_row.master_server_id)
|
||||
{
|
||||
rval = false;
|
||||
break;
|
||||
@ -1144,19 +1151,40 @@ bool MariaDBServer::can_be_promoted(OperationType op, const MariaDBServer* demot
|
||||
const SlaveStatus* MariaDBServer::slave_connection_status(const MariaDBServer* target) const
|
||||
{
|
||||
// The slave node may have several slave connections, need to find the one that is
|
||||
// connected to the parent. This section is quite similar to the one in
|
||||
// 'build_replication_graph', although here we require that the sql thread is running.
|
||||
auto target_id = target->m_server_id;
|
||||
// connected to the parent. TODO: Use the information gathered in 'build_replication_graph'
|
||||
// to skip this function, as the contents are very similar.
|
||||
const SlaveStatus* rval = NULL;
|
||||
for (const SlaveStatus& ss : m_slave_status)
|
||||
if (m_assume_unique_hostnames)
|
||||
{
|
||||
auto master_id = ss.master_server_id;
|
||||
// Should this check 'Master_Host' and 'Master_Port' instead of server id:s?
|
||||
if (master_id > 0 && master_id == target_id && ss.slave_sql_running && ss.seen_connected
|
||||
&& ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
|
||||
// Can simply compare host:port.
|
||||
SERVER* target_srv = target->m_server_base->server;
|
||||
string target_host = target_srv->address;
|
||||
int target_port = target_srv->port;
|
||||
for (const SlaveStatus& ss : m_slave_status)
|
||||
{
|
||||
rval = &ss;
|
||||
break;
|
||||
if (ss.master_host == target_host && ss.master_port == target_port &&
|
||||
ss.slave_sql_running && ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
|
||||
{
|
||||
rval = &ss;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Compare server id:s instead. If the master's id is wrong (e.g. never updated) this gives a
|
||||
// wrong result. Also gives wrong result if monitor has never seen the slave connection in the
|
||||
// connected state.
|
||||
auto target_id = target->m_server_id;
|
||||
for (const SlaveStatus& ss : m_slave_status)
|
||||
{
|
||||
auto master_id = ss.master_server_id;
|
||||
if (master_id > 0 && master_id == target_id && ss.slave_sql_running && ss.seen_connected
|
||||
&& ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
|
||||
{
|
||||
rval = &ss;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return rval;
|
||||
|
@ -119,6 +119,8 @@ public:
|
||||
/* Replication lag of the server. Used during calculation so that the actual SERVER struct is
|
||||
* only written to once. */
|
||||
int m_replication_lag = MXS_RLAG_UNDEFINED;
|
||||
/* Copy of same field in monitor object. TODO: pass in struct when adding concurrent updating. */
|
||||
bool m_assume_unique_hostnames = true;
|
||||
/* Has anything that could affect replication topology changed this iteration?
|
||||
* Causes: server id, slave connections, read-only. */
|
||||
bool m_topology_changed = true;
|
||||
@ -128,7 +130,8 @@ public:
|
||||
|
||||
bool m_print_update_errormsg = true; /* Should an update error be printed? */
|
||||
|
||||
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index);
|
||||
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index,
|
||||
bool assume_unique_hostnames = true);
|
||||
|
||||
/**
|
||||
* Print server information to a json object.
|
||||
|
Loading…
x
Reference in New Issue
Block a user