MXS-2168 Add 'assume_unique_hostnames'-setting to MariaDBMonitor

Adds the setting and uses it during replication graph creation
and in the most important checks.
This commit is contained in:
Esa Korhonen
2018-11-16 10:19:49 +02:00
parent 64a9a5135e
commit 1a046bd453
5 changed files with 143 additions and 40 deletions

View File

@ -160,8 +160,13 @@ void MariaDBMonitor::tarjan_scc_visit_node(MariaDBServer* node,
} }
} }
/**
* Use slave status and server id information to build the replication graph. Needs to be called whenever
* topology has changed, or it's suspected.
*/
void MariaDBMonitor::build_replication_graph() void MariaDBMonitor::build_replication_graph()
{ {
const bool use_hostnames = m_assume_unique_hostnames;
// First, reset all node data. // First, reset all node data.
for (MariaDBServer* server : m_servers) for (MariaDBServer* server : m_servers)
{ {
@ -169,39 +174,64 @@ void MariaDBMonitor::build_replication_graph()
server->m_node.reset_results(); server->m_node.reset_results();
} }
/* Here, all slave connections are added to the graph, even if the IO thread cannot connect. Strictly
* speaking, building the parents-array is not required as the data already exists. This construction
* is more for convenience and faster access later on. */
for (MariaDBServer* slave : m_servers) for (MariaDBServer* slave : m_servers)
{ {
/* All servers are accepted in this loop, even if the server is [Down] or [Maintenance]. For these /* Check all slave connections of all servers. Connections are added even if one or both endpoints
* servers, we just use the latest available information. Not adding such servers could suddenly * are down or in maintenance. */
* change the topology quite a bit and all it would take is a momentarily network failure. */
for (SlaveStatus& slave_conn : slave->m_slave_status) for (SlaveStatus& slave_conn : slave->m_slave_status)
{ {
/* We always trust the "Master_Server_Id"-field of the SHOW SLAVE STATUS output, as long as
* the id is > 0 (server uses 0 for default). This means that the graph constructed is faulty if
* an old "Master_Server_Id"- value is read from a slave which is still trying to connect to
* a new master. However, a server is only designated [Slave] if both IO- and SQL-threads are
* running fine, so the faulty graph does not cause wrong status settings. */
/* IF THIS PART IS CHANGED, CHANGE THE COMPARISON IN 'sstatus_arrays_topology_equal' /* IF THIS PART IS CHANGED, CHANGE THE COMPARISON IN 'sstatus_arrays_topology_equal'
* (in MariaDBServer) accordingly so that any possible topology changes are detected. */ * (in MariaDBServer) accordingly so that any possible topology changes are detected. */
auto master_id = slave_conn.master_server_id; if (slave_conn.slave_io_running != SlaveStatus::SLAVE_IO_NO && slave_conn.slave_sql_running)
if (slave_conn.slave_io_running != SlaveStatus::SLAVE_IO_NO && master_id > 0)
{ {
// Valid slave connection, find the MariaDBServer with this id. // Looks promising, check hostname or server id.
auto master = get_server(master_id); MariaDBServer* found_master = NULL;
if (master != NULL) bool is_external = false;
if (use_hostnames)
{ {
slave->m_node.parents.push_back(master); found_master = get_server(slave_conn.master_host, slave_conn.master_port);
master->m_node.children.push_back(slave); if (!found_master)
{
// Must be an external server.
is_external = true;
}
} }
else else
{
/* Cannot trust hostname:port since network may be complicated. Instead,
* trust the "Master_Server_Id"-field of the SHOW SLAVE STATUS output if
* the slave connection has been seen connected before. This means that
* the graph will miss slave-master relations that have not connected
* while the monitor has been running. TODO: This data should be saved so
* that monitor restarts do not lose this information. */
if (slave_conn.seen_connected)
{
// Valid slave connection, find the MariaDBServer with the matching server id.
found_master = get_server(slave_conn.master_server_id);
if (!found_master)
{
/* Likely an external master. It's possible that the master is a monitored
* server which has not been queried yet and the monitor does not know its
* id. */
is_external = true;
}
}
}
// Valid slave connection, find the MariaDBServer with this id.
if (found_master)
{
/* Building the parents-array is not strictly required as the same data is in
* the children-array. This construction is more for convenience and faster
* access later on. */
slave->m_node.parents.push_back(found_master);
found_master->m_node.children.push_back(slave);
}
else if (is_external)
{ {
// This is an external master connection. Save just the master id for now. // This is an external master connection. Save just the master id for now.
slave->m_node.external_masters.push_back(master_id); // TODO: Save host:port instead
slave->m_node.external_masters.push_back(slave_conn.master_server_id);
} }
} }
} }

View File

@ -47,6 +47,7 @@ static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout";
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout"; static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
static const char CN_DETECT_STANDALONE_MASTER[] = "detect_standalone_master"; static const char CN_DETECT_STANDALONE_MASTER[] = "detect_standalone_master";
static const char CN_MAINTENANCE_ON_LOW_DISK_SPACE[] = "maintenance_on_low_disk_space"; static const char CN_MAINTENANCE_ON_LOW_DISK_SPACE[] = "maintenance_on_low_disk_space";
static const char CN_ASSUME_UNIQUE_HOSTNAMES[] = "assume_unique_hostnames";
// Parameters for master failure verification and timeout // Parameters for master failure verification and timeout
static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure"; static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure";
static const char CN_MASTER_FAILURE_TIMEOUT[] = "master_failure_timeout"; static const char CN_MASTER_FAILURE_TIMEOUT[] = "master_failure_timeout";
@ -78,7 +79,7 @@ void MariaDBMonitor::reset_server_info()
// Next, initialize the data. // Next, initialize the data.
for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next) for (auto mon_server = m_monitor->monitored_servers; mon_server; mon_server = mon_server->next)
{ {
m_servers.push_back(new MariaDBServer(mon_server, m_servers.size())); m_servers.push_back(new MariaDBServer(mon_server, m_servers.size(), m_assume_unique_hostnames));
} }
for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++) for (auto iter = m_servers.begin(); iter != m_servers.end(); iter++)
{ {
@ -151,6 +152,22 @@ MariaDBServer* MariaDBMonitor::get_server(SERVER* server)
return found; return found;
} }
/**
 * Look up a monitored server by the address and port stored in its SERVER object.
 *
 * @param host Address to match against the server's 'address' field
 * @param port Port to match against the server's 'port' field
 * @return The first server in m_servers matching both values, or NULL if there is none
 */
MariaDBServer* MariaDBMonitor::get_server(const std::string& host, int port)
{
    // TODO: Do this with a map lookup
    for (MariaDBServer* candidate : m_servers)
    {
        SERVER* base = candidate->m_server_base->server;
        const bool port_matches = (base->port == port);
        const bool host_matches = (host == base->address);
        if (port_matches && host_matches)
        {
            // Servers are scanned in m_servers order; the first hit wins.
            return candidate;
        }
    }
    return NULL;
}
bool MariaDBMonitor::set_replication_credentials(const MXS_CONFIG_PARAMETER* params) bool MariaDBMonitor::set_replication_credentials(const MXS_CONFIG_PARAMETER* params)
{ {
bool rval = false; bool rval = false;
@ -197,6 +214,7 @@ bool MariaDBMonitor::configure(const MXS_CONFIG_PARAMETER* params)
m_detect_stale_slave = config_get_bool(params, "detect_stale_slave"); m_detect_stale_slave = config_get_bool(params, "detect_stale_slave");
m_ignore_external_masters = config_get_bool(params, "ignore_external_masters"); m_ignore_external_masters = config_get_bool(params, "ignore_external_masters");
m_detect_standalone_master = config_get_bool(params, CN_DETECT_STANDALONE_MASTER); m_detect_standalone_master = config_get_bool(params, CN_DETECT_STANDALONE_MASTER);
m_assume_unique_hostnames = config_get_bool(params, CN_ASSUME_UNIQUE_HOSTNAMES);
m_failcount = config_get_integer(params, CN_FAILCOUNT); m_failcount = config_get_integer(params, CN_FAILCOUNT);
m_failover_timeout = config_get_integer(params, CN_FAILOVER_TIMEOUT); m_failover_timeout = config_get_integer(params, CN_FAILOVER_TIMEOUT);
m_switchover_timeout = config_get_integer(params, CN_SWITCHOVER_TIMEOUT); m_switchover_timeout = config_get_integer(params, CN_SWITCHOVER_TIMEOUT);
@ -230,6 +248,25 @@ bool MariaDBMonitor::configure(const MXS_CONFIG_PARAMETER* params)
MXS_ERROR("Both '%s' and '%s' must be defined", CN_REPLICATION_USER, CN_REPLICATION_PASSWORD); MXS_ERROR("Both '%s' and '%s' must be defined", CN_REPLICATION_USER, CN_REPLICATION_PASSWORD);
settings_ok = false; settings_ok = false;
} }
if (!m_assume_unique_hostnames)
{
const char requires[] = "%s requires that %s is on.";
if (m_auto_failover)
{
MXB_ERROR(requires, CN_AUTO_FAILOVER, CN_ASSUME_UNIQUE_HOSTNAMES);
settings_ok = false;
}
if (m_switchover_on_low_disk_space)
{
MXB_ERROR(requires, CN_SWITCHOVER_ON_LOW_DISK_SPACE, CN_ASSUME_UNIQUE_HOSTNAMES);
settings_ok = false;
}
if (m_auto_rejoin)
{
MXB_ERROR(requires, CN_AUTO_REJOIN, CN_ASSUME_UNIQUE_HOSTNAMES);
settings_ok = false;
}
}
return settings_ok; return settings_ok;
} }
@ -1084,6 +1121,9 @@ extern "C" MXS_MODULE* MXS_CREATE_MODULE()
{ {
CN_HANDLE_EVENTS, MXS_MODULE_PARAM_BOOL, "true" CN_HANDLE_EVENTS, MXS_MODULE_PARAM_BOOL, "true"
}, },
{
CN_ASSUME_UNIQUE_HOSTNAMES, MXS_MODULE_PARAM_BOOL, "true"
},
{MXS_END_MODULE_PARAMS} {MXS_END_MODULE_PARAMS}
} }
}; };

View File

@ -200,6 +200,7 @@ private:
* TODO: think about removing */ * TODO: think about removing */
bool m_ignore_external_masters = false; /* Ignore masters outside of the monitor configuration. bool m_ignore_external_masters = false; /* Ignore masters outside of the monitor configuration.
* TODO: requires work */ * TODO: requires work */
bool m_assume_unique_hostnames = true; /* Are server hostnames consistent between MaxScale and servers */
int m_failcount = 1; /* Number of ticks master must be down before it's considered int m_failcount = 1; /* Number of ticks master must be down before it's considered
* totally down, allowing failover or master change. */ * totally down, allowing failover or master change. */
@ -253,6 +254,7 @@ private:
MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db); MariaDBServer* get_server_info(MXS_MONITORED_SERVER* db);
MariaDBServer* get_server(int64_t id); MariaDBServer* get_server(int64_t id);
MariaDBServer* get_server(SERVER* server); MariaDBServer* get_server(SERVER* server);
MariaDBServer* get_server(const std::string& host, int port);
// Cluster discovery and status assignment methods, top levels // Cluster discovery and status assignment methods, top levels
void update_server(MariaDBServer* server); void update_server(MariaDBServer* server);

View File

@ -35,9 +35,11 @@ public:
std::string status; std::string status;
}; };
MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index) MariaDBServer::MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index,
bool assume_unique_hostnames)
: m_server_base(monitored_server) : m_server_base(monitored_server)
, m_config_index(config_index) , m_config_index(config_index)
, m_assume_unique_hostnames(assume_unique_hostnames)
{ {
mxb_assert(monitored_server); mxb_assert(monitored_server);
} }
@ -951,8 +953,9 @@ void MariaDBServer::set_status(uint64_t bits)
/** /**
* Compare if the given slave status array is equal to the one stored in the MariaDBServer. * Compare if the given slave status array is equal to the one stored in the MariaDBServer.
* Only compares the parts relevant for building replication topology: master server id:s and * Only compares the parts relevant for building replication topology: slave IO/SQL state,
* slave connection io states. * host:port and master server id:s. When unsure, return false. This must match
* 'build_replication_graph()' in the monitor class.
* *
* @param new_slave_status Right hand side * @param new_slave_status Right hand side
* @return True if equal * @return True if equal
@ -969,10 +972,14 @@ bool MariaDBServer::sstatus_array_topology_equal(const SlaveStatusArray& new_sla
{ {
for (size_t i = 0; i < old_slave_status.size(); i++) for (size_t i = 0; i < old_slave_status.size(); i++)
{ {
// It's enough to check just the following two items, as these are used in const auto new_row = new_slave_status[i];
// 'build_replication_graph'. const auto old_row = old_slave_status[i];
if (old_slave_status[i].slave_io_running != new_slave_status[i].slave_io_running // Strictly speaking, the following should depend on the 'assume_unique_hostnames',
|| old_slave_status[i].master_server_id != new_slave_status[i].master_server_id) // but the situations it would make a difference are so rare they can be ignored.
if (new_row.slave_io_running != old_row.slave_io_running
|| new_row.slave_sql_running != old_row.slave_sql_running
|| new_row.master_host != old_row.master_host || new_row.master_port != old_row.master_port
|| new_row.master_server_id != old_row.master_server_id)
{ {
rval = false; rval = false;
break; break;
@ -1144,19 +1151,40 @@ bool MariaDBServer::can_be_promoted(OperationType op, const MariaDBServer* demot
const SlaveStatus* MariaDBServer::slave_connection_status(const MariaDBServer* target) const const SlaveStatus* MariaDBServer::slave_connection_status(const MariaDBServer* target) const
{ {
// The slave node may have several slave connections, need to find the one that is // The slave node may have several slave connections, need to find the one that is
// connected to the parent. This section is quite similar to the one in // connected to the parent. TODO: Use the information gathered in 'build_replication_graph'
// 'build_replication_graph', although here we require that the sql thread is running. // to skip this function, as the contents are very similar.
auto target_id = target->m_server_id;
const SlaveStatus* rval = NULL; const SlaveStatus* rval = NULL;
for (const SlaveStatus& ss : m_slave_status) if (m_assume_unique_hostnames)
{ {
auto master_id = ss.master_server_id; // Can simply compare host:port.
// Should this check 'Master_Host' and 'Master_Port' instead of server id:s? SERVER* target_srv = target->m_server_base->server;
if (master_id > 0 && master_id == target_id && ss.slave_sql_running && ss.seen_connected string target_host = target_srv->address;
&& ss.slave_io_running != SlaveStatus::SLAVE_IO_NO) int target_port = target_srv->port;
for (const SlaveStatus& ss : m_slave_status)
{ {
rval = &ss; if (ss.master_host == target_host && ss.master_port == target_port &&
break; ss.slave_sql_running && ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
{
rval = &ss;
break;
}
}
}
else
{
// Compare server id:s instead. If the master's id is wrong (e.g. never updated) this gives a
// wrong result. Also gives wrong result if monitor has never seen the slave connection in the
// connected state.
auto target_id = target->m_server_id;
for (const SlaveStatus& ss : m_slave_status)
{
auto master_id = ss.master_server_id;
if (master_id > 0 && master_id == target_id && ss.slave_sql_running && ss.seen_connected
&& ss.slave_io_running != SlaveStatus::SLAVE_IO_NO)
{
rval = &ss;
break;
}
} }
} }
return rval; return rval;

View File

@ -119,6 +119,8 @@ public:
/* Replication lag of the server. Used during calculation so that the actual SERVER struct is /* Replication lag of the server. Used during calculation so that the actual SERVER struct is
* only written to once. */ * only written to once. */
int m_replication_lag = MXS_RLAG_UNDEFINED; int m_replication_lag = MXS_RLAG_UNDEFINED;
/* Copy of same field in monitor object. TODO: pass in struct when adding concurrent updating. */
bool m_assume_unique_hostnames = true;
/* Has anything that could affect replication topology changed this iteration? /* Has anything that could affect replication topology changed this iteration?
* Causes: server id, slave connections, read-only. */ * Causes: server id, slave connections, read-only. */
bool m_topology_changed = true; bool m_topology_changed = true;
@ -128,7 +130,8 @@ public:
bool m_print_update_errormsg = true; /* Should an update error be printed? */ bool m_print_update_errormsg = true; /* Should an update error be printed? */
MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index); MariaDBServer(MXS_MONITORED_SERVER* monitored_server, int config_index,
bool assume_unique_hostnames = true);
/** /**
* Print server information to a json object. * Print server information to a json object.