MXS-2168 Add 'assume_unique_hostnames'-setting to MariaDBMonitor

Adds the setting and uses it during replication graph creation
and in the most important checks.
This commit is contained in:
Esa Korhonen 2018-11-16 10:19:49 +02:00
parent 64a9a5135e
commit 1a046bd453
5 changed files with 143 additions and 40 deletions

View File

@ -160,8 +160,13 @@ void MariaDBMonitor::tarjan_scc_visit_node(MariaDBServer* node,
}
}
/**
* Use slave status and server id information to build the replication graph. Needs to be called whenever
* the topology has changed, or is suspected to have changed.
*/
void MariaDBMonitor::build_replication_graph()
{
const bool use_hostnames = m_assume_unique_hostnames;
// First, reset all node data.
for (MariaDBServer* server : m_servers)
{
@ -169,39 +174,64 @@ void MariaDBMonitor::build_replication_graph()
server->m_node.reset_results();
}
/* Here, all slave connections are added to the graph, even if the IO thread cannot connect. Strictly
* speaking, building the parents-array is not required as the data already exists. This construction
* is more for convenience and faster access later on. */
for (MariaDBServer* slave : m_servers)
{
/* All servers are accepted in this loop, even if the server is [Down] or [Maintenance]. For these
* servers, we just use the latest available information. Not adding such servers could suddenly
* change the topology quite a bit and all it would take is a momentary network failure. */
/* Check all slave connections of all servers. Connections are added even if one or both endpoints
* are down or in maintenance. */
for (SlaveStatus& slave_conn : slave->m_slave_status)
{
/* We always trust the "Master_Server_Id"-field of the SHOW SLAVE STATUS output, as long as
* the id is > 0 (server uses 0 for default). This means that the graph constructed is faulty if
* an old "Master_Server_Id"- value is read from a slave which is still trying to connect to
* a new master. However, a server is only designated [Slave] if both IO- and SQL-threads are
* running fine, so the faulty graph does not cause wrong status settings. */
/* IF THIS PART IS CHANGED, CHANGE THE COMPARISON IN 'sstatus_arrays_topology_equal'
* (in MariaDBServer) accordingly so that any possible topology changes are detected. */
auto master_id = slave_conn.master_server_id;
if (slave_conn.slave_io_running != SlaveStatus::SLAVE_IO_NO && master_id > 0)
if (slave_conn.slave_io_running != SlaveStatus::SLAVE_IO_NO && slave_conn.slave_sql_running)
{
// Valid slave connection, find the MariaDBServer with this id.
auto master = get_server(master_id);
if (master != NULL)
// Looks promising, check hostname or server id.
MariaDBServer* found_master = NULL;
bool is_external = false;
if (use_hostnames)
{
slave->m_node.parents.push_back(master);
master->m_node.children.push_back(slave);
found_master = get_server(slave_conn.master_host, slave_conn.master_port);
if (!found_master)
{
// Must be an external server.
is_external = true;
}
}
else
{
/* Cannot trust hostname:port since network may be complicated. Instead,
* trust the "Master_Server_Id"-field of the SHOW SLAVE STATUS output if
* the slave connection has been seen connected before. This means that
* the graph will miss slave-master relations that have not connected
* while the monitor has been running. TODO: This data should be saved so
* that monitor restarts do not lose this information. */
if (slave_conn.seen_connected)
{
// Valid slave connection, find the MariaDBServer with the matching server id.
found_master = get_server(slave_conn.master_server_id);
if (!found_master)
{
/* Likely an external master. It's possible that the master is a monitored
* server which has not been queried yet and the monitor does not know its
* id. */
is_external = true;
}
}
}
// Valid slave connection, find the MariaDBServer with this id.
if (found_master)
{
/* Building the parents-array is not strictly required as the same data is in
* the children-array. This construction is more for convenience and faster
* access later on. */
slave->m_node.parents.push_back(found_master);
found_master->m_node.children.push_back(slave);
}
else if (is_external)
{
// This is an external master connection. Save just the master id for now.
slave->m_node.external_masters.push_back(master_id);
// TODO: Save host:port instead
slave->m_node.external_masters.push_back(slave_conn.master_server_id);
}
}
}

View File

@ -47,6 +47,7 @@ static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout";
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
static const char CN_DETECT_STANDALONE_MASTER[] = "detect_standalone_master";
static const char CN_MAINTENANCE_ON_LOW_DISK_SPACE[] = "maintenance_on_low_disk_space";
static const char CN_ASSUME_UNIQUE_HOSTNAMES[] = "assume_unique_hostnames";
// Parameters for master failure verification and timeout
static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure";
static const char CN_MASTER_FAILURE_TIMEOUT[] = "master_failure_timeout";
@ -78,7 +79,7 @@ void MariaDBMonitor::reset_server_info()
// Next, initialize the data.
for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next)
{
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size()));
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size(), m_assume_unique_hostnames));
}
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{
@ -151,6 +152,22 @@ MariaDBServer* MariaDBMonitor::get_server(SERVER* server)
return found;
}
/**
 * Find the monitored server whose configured address and port match the given values.
 * The servers are scanned in the order they are stored in 'm_servers' and the first match wins.
 *
 * @param host Address to compare against the address of each monitored server
 * @param port Port to compare against the port of each monitored server
 * @return The first matching server, or NULL if no monitored server matches
 */
MariaDBServer* MariaDBMonitor::get_server(const std::string& host, int port)
{
    // TODO: Do this with a map lookup
    for (MariaDBServer* candidate : m_servers)
    {
        const SERVER* base = candidate->m_server_base->server;
        // Both the port and the address must be equal for the server to be accepted.
        if (base->port == port && host == base->address)
        {
            return candidate;
        }
    }
    // Nothing matched.
    return NULL;
}
bool MariaDBMonitor::set_replication_credentials(const MXS_CONFIG_PARAMETER* params)
{
bool rval = false;
@ -197,6 +214,7 @@ bool MariaDBMonitor::configure(const MXS_CONFIG_PARAMETER* params)
m_detect_stale_slave = config_get_bool(params, "detect_stale_slave");
m_ignore_external_masters = config_get_bool(params, "ignore_external_masters");
m_detect_standalone_master = config_get_bool(params, CN_DETECT_STANDALONE_MASTER);
m_assume_unique_hostnames = config_get_bool(params, CN_ASSUME_UNIQUE_HOSTNAMES);
m_failcount = config_get_integer(params, CN_FAILCOUNT);
m_failover_timeout = config_get_integer(params, CN_FAILOVER_TIMEOUT);
m_switchover_timeout = config_get_integer(params, CN_SWITCHOVER_TIMEOUT);
@ -230,6 +248,25 @@ bool MariaDBMonitor::configure(const MXS_CONFIG_PARAMETER* params)
MXS_ERROR("Both '%s' and '%s' must be defined", CN_REPLICATION_USER, CN_REPLICATION_PASSWORD);
settings_ok = false;
}
if (!m_assume_unique_hostnames)
{
const char requires[] = "%s requires that %s is on.";
if (m_auto_failover)
{
MXB_ERROR(requires, CN_AUTO_FAILOVER, CN_ASSUME_UNIQUE_HOSTNAMES);
settings_ok = false;
}
if (m_switchover_on_low_disk_space)
{
MXB_ERROR(requires, CN_SWITCHOVER_ON_LOW_DISK_SPACE, CN_ASSUME_UNIQUE_HOSTNAMES);
settings_ok = false;
}
if (m_auto_rejoin)
{
MXB_ERROR(requires, CN_AUTO_REJOIN, CN_ASSUME_UNIQUE_HOSTNAMES);
settings_ok = false;
}
}
return settings_ok;
}
@ -1084,6 +1121,9 @@ extern "C" MXS_MODULE* MXS_CREATE_MODULE()
{
CN_HANDLE_EVENTS, MXS_MODULE_PARAM_BOOL, "true"
},
{
CN_ASSUME_UNIQUE_HOSTNAMES, MXS_MODULE_PARAM_BOOL, "true"
},
{MXS_END_MODULE_PARAMS}
}
};

View File

@ -200,6 +200,7 @@ private:
* TODO: think about removing */
bool m_ignore_external_masters = false; /* Ignore masters outside of the monitor configuration.
* TODO: requires work */
bool m_assume_unique_hostnames = true; /* Are server hostnames consistent between MaxScale and servers */
int m_failcount = 1; /* Number of ticks master must be down before it's considered
* totally down, allowing failover or master change. */
@ -253,6 +254,7 @@ private:
MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
MariaDBServer* get_server(int64_t id);
MariaDBServer* get_server(SERVER* server);
MariaDBServer* get_server(const std::string& host, int port);
// Cluster discovery and status assignment methods, top levels
void update_server(MariaDBServer* server);

View File

@ -35,9 +35,11 @@ public:
std::string status;
};
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index)
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index,
bool assume_unique_hostnames)
: m_server_base(monitored_server)
, m_config_index(config_index)
, m_assume_unique_hostnames(assume_unique_hostnames)
{
mxb_assert(monitored_server);
}
@ -951,8 +953,9 @@ void MariaDBServer::set_status(uint64_t bits)
/**
* Compare if the given slave status array is equal to the one stored in the MariaDBServer.
* Only compares the parts relevant for building replication topology: master server id:s and
* slave connection io states.
* Only compares the parts relevant for building replication topology: slave IO/SQL state,
* host:port and master server id:s. When unsure, return false. This must match
* 'build_replication_graph()' in the monitor class.
*
* @param new_slave_status Right hand side
* @return True if equal
@ -969,10 +972,14 @@ bool MariaDBServer::sstatus_array_topology_equal(const SlaveStatusArray& new_sla
{
for (size_t i = 0; i < old_slave_status.size(); i++)
{
// It's enough to check just the following two items, as these are used in
// 'build_replication_graph'.
if (old_slave_status[i].slave_io_running != new_slave_status[i].slave_io_running
|| old_slave_status[i].master_server_id != new_slave_status[i].master_server_id)
const auto new_row = new_slave_status[i];
const auto old_row = old_slave_status[i];
// Strictly speaking, the following should depend on the 'assume_unique_hostnames'-setting,
// but the situations where it would make a difference are so rare that they can be ignored.
if (new_row.slave_io_running != old_row.slave_io_running
|| new_row.slave_sql_running != old_row.slave_sql_running
|| new_row.master_host != old_row.master_host || new_row.master_port != old_row.master_port
|| new_row.master_server_id != old_row.master_server_id)
{
rval = false;
break;
@ -1144,19 +1151,40 @@ bool MariaDBServer::can_be_promoted(OperationType op, const MariaDBServer* demot
const SlaveStatus* MariaDBServer::slave_connection_status(const MariaDBServer* target) const
{
// The slave node may have several slave connections, need to find the one that is
// connected to the parent. This section is quite similar to the one in
// 'build_replication_graph', although here we require that the sql thread is running.
auto target_id = target->m_server_id;
// connected to the parent. TODO: Use the information gathered in 'build_replication_graph'
// to skip this function, as the contents are very similar.
const SlaveStatus* rval = NULL;
for (const SlaveStatus& ss : m_slave_status)
if (m_assume_unique_hostnames)
{
auto master_id = ss.master_server_id;
// Should this check 'Master_Host' and 'Master_Port' instead of server id:s?
if (master_id > 0 && master_id == target_id && ss.slave_sql_running && ss.seen_connected
&& ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
// Can simply compare host:port.
SERVER* target_srv = target->m_server_base->server;
string target_host = target_srv->address;
int target_port = target_srv->port;
for (const SlaveStatus& ss : m_slave_status)
{
rval = &ss;
break;
if (ss.master_host == target_host && ss.master_port == target_port &&
ss.slave_sql_running && ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
{
rval = &ss;
break;
}
}
}
else
{
// Compare server ids instead. If the master's id is wrong (e.g. never updated), this gives a
// wrong result. It also gives a wrong result if the monitor has never seen the slave connection
// in the connected state.
auto target_id = target->m_server_id;
for (const SlaveStatus& ss : m_slave_status)
{
auto master_id = ss.master_server_id;
if (master_id > 0 && master_id == target_id && ss.slave_sql_running && ss.seen_connected
&& ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
{
rval = &ss;
break;
}
}
}
return rval;

View File

@ -119,6 +119,8 @@ public:
/* Replication lag of the server. Used during calculation so that the actual SERVER struct is
* only written to once. */
int m_replication_lag = MXS_RLAG_UNDEFINED;
/* Copy of same field in monitor object. TODO: pass in struct when adding concurrent updating. */
bool m_assume_unique_hostnames = true;
/* Has anything that could affect replication topology changed this iteration?
* Causes: server id, slave connections, read-only. */
bool m_topology_changed = true;
@ -128,7 +130,8 @@ public:
bool m_print_update_errormsg = true; /* Should an update error be printed? */
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index);
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index,
bool assume_unique_hostnames = true);
/**
* Print server information to a json object.