MXS-1905 Set slaves with low disk space to maintenance

Also, servers in maintenance are updated just as other servers.
This commit is contained in:
Esa Korhonen 2018-06-27 16:39:32 +03:00
parent 8d7cb27884
commit fd31c9cced
4 changed files with 69 additions and 42 deletions

View File

@ -262,6 +262,14 @@ Note that once the server has been put in maintenance mode, the disk space
situation will no longer be monitored and the server will thus not automatically
be taken out of maintenance mode even if disk space becomes available again.
### `maintenance_on_low_disk_space`
This feature is enabled by default. If a running server that is not the master
or a relay master is out of disk space (as defined by the general monitor
setting `disk_space_threshold`), the server is set to maintenance mode. Such
servers are not used for router sessions and are ignored when performing a
failover or other cluster modification operation.
## Failover, switchover and auto-rejoin
Starting with MaxScale 2.2.1, MariaDB Monitor supports replication cluster

View File

@ -1610,3 +1610,16 @@ void MariaDBMonitor::enforce_read_only_on_slaves()
}
}
}
void MariaDBMonitor::set_low_disk_slaves_maintenance()
{
// Only set pure slave and standalone servers to maintenance.
for (MariaDBServer* server : m_servers)
{
if (server->has_status(SERVER_DISK_SPACE_EXHAUSTED) && server->is_running() &&
!server->is_master() && !server->is_relay_server())
{
server->set_status(SERVER_MAINT);
}
}
}

View File

@ -42,7 +42,7 @@ static const char CN_NO_PROMOTE_SERVERS[] = "servers_no_promotion";
static const char CN_FAILOVER_TIMEOUT[] = "failover_timeout";
static const char CN_SWITCHOVER_ON_LOW_DISK_SPACE[] = "switchover_on_low_disk_space";
static const char CN_SWITCHOVER_TIMEOUT[] = "switchover_timeout";
// Boolean setting (module default "true"). When enabled, each monitor tick calls
// set_low_disk_slaves_maintenance().
static const char CN_MAINTENANCE_ON_LOW_DISK_SPACE[] = "maintenance_on_low_disk_space";
// Parameters for master failure verification and timeout
static const char CN_VERIFY_MASTER_FAILURE[] = "verify_master_failure";
static const char CN_MASTER_FAILURE_TIMEOUT[] = "master_failure_timeout";
@ -198,6 +198,7 @@ bool MariaDBMonitor::configure(const MXS_CONFIG_PARAMETER* params)
m_promote_sql_file = config_get_string(params, CN_PROMOTION_SQL_FILE);
m_demote_sql_file = config_get_string(params, CN_DEMOTION_SQL_FILE);
m_switchover_on_low_disk_space = config_get_bool(params, CN_SWITCHOVER_ON_LOW_DISK_SPACE);
m_maintenance_on_low_disk_space = config_get_bool(params, CN_MAINTENANCE_ON_LOW_DISK_SPACE);
m_excluded_servers.clear();
MXS_MONITORED_SERVER** excluded_array = NULL;
@ -296,65 +297,61 @@ json_t* MariaDBMonitor::diagnostics_json() const
/**
 * Probe one monitored server and refresh its status.
 *
 * The server is pinged or (re)connected with mon_ping_or_connect_to_db(). If the connection
 * is ok, SERVER_RUNNING is set, the server version is re-read on a new connection, and, when
 * the version is known, permissions are re-checked if they failed last time or the connection
 * is new. Provided there is no SERVER_AUTH_ERROR, the disk space status is refreshed when
 * should_update_disk_space_status() says so and monitor_server() is called.
 *
 * If the connection is not ok, every status bit except SERVER_WAS_MASTER is cleared,
 * SERVER_AUTH_ERROR is set for access-denied errors, and the connect error is logged only if
 * the previous status had SERVER_RUNNING or SERVER_MAINT set.
 *
 * Finally mon_err_count is reset to 0 when the server is running or in maintenance and
 * incremented otherwise.
 *
 * NOTE(review): the text below interleaves two revisions of this body (one that only probes
 * the server when it is not in maintenance, one that always probes it), so statements are
 * duplicated and it does not compile as shown. Confirm which revision applies.
 *
 * @param server The server to update
 */
void MariaDBMonitor::update_server(MariaDBServer& server)
{
MXS_MONITORED_SERVER* mon_srv = server.m_server_base;
/* Monitor server if not in maintenance. */
bool in_maintenance = server.is_in_maintenance();
if (!in_maintenance)
{
mxs_connect_result_t conn_status = mon_ping_or_connect_to_db(m_monitor, mon_srv);
MYSQL* conn = mon_srv->con; // mon_ping_or_connect_to_db() may have reallocated the MYSQL struct.
mxs_connect_result_t conn_status = mon_ping_or_connect_to_db(m_monitor, mon_srv);
MYSQL* conn = mon_srv->con; // mon_ping_or_connect_to_db() may have reallocated the MYSQL struct.
if (mon_connection_is_ok(conn_status))
if (mon_connection_is_ok(conn_status))
{
server.set_status(SERVER_RUNNING);
if (conn_status == MONITOR_CONN_NEWCONN_OK)
{
server.set_status(SERVER_RUNNING);
if (conn_status == MONITOR_CONN_NEWCONN_OK)
// Is a new connection or a reconnection. Check server version.
server.update_server_version();
}
if (server.m_version != MariaDBServer::version::UNKNOWN)
{
// Check permissions if permissions failed last time or if this is a new connection.
if (server.had_status(SERVER_AUTH_ERROR) || conn_status == MONITOR_CONN_NEWCONN_OK)
{
// Is a new connection or a reconnection. Check server version.
server.update_server_version();
server.check_permissions();
}
if (server.m_version != MariaDBServer::version::UNKNOWN)
// If permissions are ok, continue.
if (!server.has_status(SERVER_AUTH_ERROR))
{
// Check permissions if permissions failed last time or if this is a new connection.
if (server.had_status(SERVER_AUTH_ERROR) || conn_status == MONITOR_CONN_NEWCONN_OK)
if (should_update_disk_space_status(mon_srv))
{
server.check_permissions();
update_disk_space_status(mon_srv);
}
// If permissions are ok, continue.
if (!server.has_status(SERVER_AUTH_ERROR))
{
if (should_update_disk_space_status(mon_srv))
{
update_disk_space_status(mon_srv);
}
// Query MariaDBServer specific data
server.monitor_server();
}
// Query MariaDBServer specific data
server.monitor_server();
}
}
else
}
else
{
/* The current server is not running. Clear all but the stale master bit as it is used to detect
* masters that went down but came up. */
server.clear_status(~SERVER_WAS_MASTER);
auto conn_errno = mysql_errno(conn);
if (conn_errno == ER_ACCESS_DENIED_ERROR || conn_errno == ER_ACCESS_DENIED_NO_PASSWORD_ERROR)
{
/* The current server is not running. Clear all but the stale master bit as it is used to detect
* masters that went down but came up. */
server.clear_status(~SERVER_WAS_MASTER);
auto conn_errno = mysql_errno(conn);
if (conn_errno == ER_ACCESS_DENIED_ERROR || conn_errno == ER_ACCESS_DENIED_NO_PASSWORD_ERROR)
{
server.set_status(SERVER_AUTH_ERROR);
}
server.set_status(SERVER_AUTH_ERROR);
}
/* Log connect failure only once, that is, if server was RUNNING or MAINTENANCE during last
* iteration. */
if (mon_srv->mon_prev_status & (SERVER_RUNNING | SERVER_MAINT))
{
mon_log_connect_error(mon_srv, conn_status);
}
/* Log connect failure only once, that is, if server was RUNNING or MAINTENANCE during last
* iteration. */
if (mon_srv->mon_prev_status & (SERVER_RUNNING | SERVER_MAINT))
{
mon_log_connect_error(mon_srv, conn_status);
}
}
/** Increase or reset the error count of the server. */
bool is_running = server.is_running();
bool in_maintenance = server.is_in_maintenance();
mon_srv->mon_err_count = (is_running || in_maintenance) ? 0 : mon_srv->mon_err_count + 1;
}
@ -526,6 +523,11 @@ void MariaDBMonitor::tick()
measure_replication_lag();
}
if (m_maintenance_on_low_disk_space)
{
set_low_disk_slaves_maintenance();
}
// Update shared status. The next functions read the shared status. TODO: change the following
// functions to read "pending_status" instead.
for (auto mon_srv = m_monitor->monitored_servers; mon_srv; mon_srv = mon_srv->next)
@ -1304,6 +1306,7 @@ extern "C" MXS_MODULE* MXS_CREATE_MODULE()
{CN_PROMOTION_SQL_FILE, MXS_MODULE_PARAM_PATH},
{CN_DEMOTION_SQL_FILE, MXS_MODULE_PARAM_PATH},
{CN_SWITCHOVER_ON_LOW_DISK_SPACE, MXS_MODULE_PARAM_BOOL, "false"},
{CN_MAINTENANCE_ON_LOW_DISK_SPACE, MXS_MODULE_PARAM_BOOL, "true"},
{MXS_END_MODULE_PARAMS}
}
};

View File

@ -168,6 +168,8 @@ private:
std::string m_demote_sql_file; /**< File with sql commands which are ran to a server being demoted. */
bool m_enforce_read_only_slaves; /**< Should the monitor set read-only=1 on any slave servers. */
bool m_switchover_on_low_disk_space; /**< Should the monitor do a switchover on low disk space. */
bool m_maintenance_on_low_disk_space; /**< Set slave and unreplicating servers with low disk space to
* maintenance. */
// Other settings
std::string m_script; /**< Script to call when state changes occur on servers */
@ -215,6 +217,7 @@ private:
bool master_is_valid(std::string* reason_out);
bool cycle_has_master_server(ServerArray& cycle_servers);
void update_master_cycle_info();
void set_low_disk_slaves_maintenance();
// Switchover methods
bool manual_switchover(SERVER* new_master, SERVER* current_master, json_t** error_out);