MXS-1446: Store more detailed event information

The timestamp of the last change from passive to active is now
tracked. This, with the timestamps of the last master_down and master_up
events, allows detection of cases when MaxScale was failed over but the
failover was not done.

Currently, only a warning is logged if no new master has appeared within
90 seconds of a master_down event and MaxScale was set to active from
passive.

The last event and when the event was triggered is now shown for all
servers. The latest change from passive to active is also shown.
This commit is contained in:
Markus Mäkelä
2017-09-26 09:03:49 +03:00
parent 3e1d89ff17
commit ef2ee38ccf
9 changed files with 108 additions and 16 deletions

View File

@ -27,15 +27,16 @@
#include <sys/stat.h>
#include <maxscale/alloc.h>
#include <mysqld_error.h>
#include <maxscale/paths.h>
#include <maxscale/utils.h>
#include <maxscale/hk_heartbeat.h>
#include <maxscale/json_api.h>
#include <maxscale/log_manager.h>
#include <maxscale/mysql_utils.h>
#include <maxscale/paths.h>
#include <maxscale/pcre2.h>
#include <maxscale/secrets.h>
#include <maxscale/spinlock.h>
#include <maxscale/json_api.h>
#include <maxscale/utils.h>
#include <mysqld_error.h>
#include "maxscale/config.h"
#include "maxscale/externcmd.h"
@ -135,6 +136,7 @@ MXS_MONITOR* monitor_alloc(const char *name, const char *module)
mon->script_timeout = DEFAULT_SCRIPT_TIMEOUT;
mon->parameters = NULL;
mon->server_pending_changes = false;
mon->failover_timeout = DEFAULT_FAILOVER_TIMEOUT;
spinlock_init(&mon->lock);
spinlock_acquire(&monLock);
mon->next = allMonitors;
@ -321,7 +323,8 @@ bool monitorAddServer(MXS_MONITOR *mon, SERVER *server)
db->next = NULL;
db->mon_err_count = 0;
db->log_version_err = true;
db->last_event = UNDEFINED_EVENT;
db->new_event = true;
/** Server status is uninitialized */
db->mon_prev_status = -1;
/* pending status is updated by get_replication_tree */
@ -1048,15 +1051,8 @@ static mxs_monitor_event_t mon_get_event_type(MXS_MONITOR_SERVERS* node)
return rval;
}
/*
* Given a monitor event (enum) provide a text string equivalent
* @param node The monitor server data whose event is wanted
* @result string The name of the monitor event for the server
*/
static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
const char* mon_get_event_name(mxs_monitor_event_t event)
{
mxs_monitor_event_t event = mon_get_event_type(node);
for (int i = 0; mxs_monitor_event_enum_values[i].name; i++)
{
if (mxs_monitor_event_enum_values[i].enum_value & event)
@ -1069,6 +1065,16 @@ static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
return "undefined_event";
}
/*
* Given a monitor event (enum) provide a text string equivalent
* @param node The monitor server data whose event is wanted
* @result string The name of the monitor event for the server
*/
static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
{
return mon_get_event_name(mon_get_event_type(node));
}
enum credentials_approach_t
{
CREDENTIALS_INCLUDE,
@ -1718,13 +1724,55 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_
* In this case, a failover script should be called if no master_up
* or new_master events are triggered within a pre-defined time limit.
*/
ptr->last_event = mon_get_event_type(ptr);
mxs_monitor_event_t event = mon_get_event_type(ptr);
ptr->server->last_event = event;
ptr->server->triggered_at = hkheartbeat;
ptr->new_event = true;
if (event == MASTER_DOWN_EVENT)
{
monitor->last_master_down = hkheartbeat;
}
else if (event == MASTER_UP_EVENT || event == NEW_MASTER_EVENT)
{
monitor->last_master_up = hkheartbeat;
}
if (script && (events & mon_get_event_type(ptr)))
{
monitor_launch_script(monitor, ptr, script);
}
}
else
{
/**
* If a master_down event was triggered when this MaxScale was
* passive, we need to execute the failover script again if no new
* masters have appeared and this MaxScale has been set as active
* since the event took place.
*/
MXS_CONFIG* cnf = config_get_global_options();
if (!cnf->passive && // This is not a passive MaxScale
ptr->server->last_event == MASTER_DOWN_EVENT && // This is a master that went down
cnf->promoted_at >= ptr->server->triggered_at && // Promoted to active after the event took place
ptr->new_event && // Event has not yet been processed
monitor->last_master_down > monitor->last_master_up) // Latest relevant event
{
// Scale the timeout to heartbeat resolution which is 1/10th of a second
int64_t timeout = (int64_t)monitor->failover_timeout * 10;
int64_t t = hkheartbeat - ptr->server->triggered_at;
if (t > timeout)
{
MXS_WARNING("Failover of server '%s' did not take place within "
"%u seconds, failover script needs to be re-triggered",
ptr->server->unique_name, monitor->failover_timeout);
// TODO: Launch the failover script
ptr->new_event = false;
}
}
}
}
}