MXS-1446: Store more detailed event information

The timestamp of the last change from passive to active is now
tracked. This, with the timestamps of the last master_down and master_up
events, allows detection of cases when MaxScale was failed over but the
failover was not done.

Currently, only a warning is logged if no new master has appeared within
90 seconds of a master_down event and MaxScale was set to active from
passive.

The last event and when the event was triggered is now shown for all
servers. The latest change from passive to active is also shown.
This commit is contained in:
Markus Mäkelä 2017-09-26 09:03:49 +03:00
parent 3e1d89ff17
commit ef2ee38ccf
9 changed files with 108 additions and 16 deletions

View File

@ -210,6 +210,8 @@ typedef struct
unsigned int auth_write_timeout; /**< Write timeout for the user authentication */
bool skip_permission_checks; /**< Skip service and monitor permission checks */
bool passive; /**< True if MaxScale is in passive mode */
int64_t promoted_at; /**< Time when this Maxscale instance was
* promoted from a passive to an active */
char qc_name[PATH_MAX]; /**< The name of the query classifier to load */
char* qc_args; /**< Arguments for the query classifier */
qc_sql_mode_t qc_sql_mode; /**< The query classifier sql mode */

View File

@ -169,7 +169,7 @@ typedef struct monitor_servers
int mon_err_count;
unsigned int mon_prev_status;
unsigned int pending_status; /**< Pending Status flag bitmap */
mxs_monitor_event_t last_event; /**< The last event that occurred on this server*/
bool new_event; /**< Whether an action was taken on the last event */
struct monitor_servers *next; /**< The next server in the list */
} MXS_MONITOR_SERVERS;
@ -205,6 +205,9 @@ struct mxs_monitor
bool active; /**< True if monitor is active */
time_t journal_max_age; /**< Maximum age of journal file */
uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */
int32_t failover_timeout; /**< Timeout in seconds for failover script */
int64_t last_master_up; /**< Time when the last master_up event was triggered */
int64_t last_master_down; /**< Time when the last master_down event was triggered */
struct mxs_monitor *next; /**< Next monitor in the linked list */
};
@ -266,6 +269,7 @@ bool mon_print_fail_status(MXS_MONITOR_SERVERS* mon_srv);
mxs_connect_result_t mon_ping_or_connect_to_db(MXS_MONITOR* mon, MXS_MONITOR_SERVERS *database);
void mon_log_connect_error(MXS_MONITOR_SERVERS* database, mxs_connect_result_t rval);
const char* mon_get_event_name(mxs_monitor_event_t event);
void lock_monitor_servers(MXS_MONITOR *monitor);
void release_monitor_servers(MXS_MONITOR *monitor);

View File

@ -135,6 +135,8 @@ typedef struct server
uint8_t charset; /**< Default server character set */
bool is_active; /**< Server is active and has not been "destroyed" */
bool proxy_protocol; /**< Send proxy-protocol header to backend when connecting client sessions. */
int last_event; /**< The last event that occurred on this server */
int64_t triggered_at; /**< Time when the last event was triggered */
#if defined(SS_DEBUG)
skygw_chk_t server_chk_tail;
#endif

View File

@ -30,6 +30,8 @@ exports.builder = function(yargs) {
{'Address': 'attributes.parameters.address'},
{'Port': 'attributes.parameters.port'},
{'State': 'attributes.state'},
{'Last Event': 'attributes.last_event'},
{'Triggered At': 'attributes.triggered_at'},
{'Services': 'relationships.services.data[].id'},
{'Monitors': 'relationships.monitors.data[].id'},
{'Master ID': 'attributes.master_id'},
@ -131,6 +133,7 @@ exports.builder = function(yargs) {
{'Version': 'attributes.version'},
{'Commit': 'attributes.commit'},
{'Started At': 'attributes.started_at'},
{'Activated At': 'attributes.activated_at'},
{'Uptime': 'attributes.uptime'},
{'Parameters': 'attributes.parameters'}
])

View File

@ -43,6 +43,7 @@
#include <maxscale/http.hh>
#include <maxscale/version.h>
#include <maxscale/maxscale.h>
#include <maxscale/hk_heartbeat.h>
#include "maxscale/config.h"
#include "maxscale/filter.h"
@ -1761,6 +1762,7 @@ void config_set_global_defaults()
gateway.admin_ssl_cert[0] = '\0';
gateway.admin_ssl_ca_cert[0] = '\0';
gateway.passive = false;
gateway.promoted_at = 0;
gateway.thread_stack_size = 0;
pthread_attr_t attr;
@ -3913,10 +3915,13 @@ json_t* config_maxscale_to_json(const char* host)
}
json_t* attr = json_object();
time_t started = maxscale_started();
time_t activated = started + (cnf->promoted_at / 10);
json_object_set_new(attr, CN_PARAMETERS, param);
json_object_set_new(attr, "version", json_string(MAXSCALE_VERSION));
json_object_set_new(attr, "commit", json_string(MAXSCALE_COMMIT));
json_object_set_new(attr, "started_at", json_string(http_to_date(maxscale_started()).c_str()));
json_object_set_new(attr, "started_at", json_string(http_to_date(started).c_str()));
json_object_set_new(attr, "activated_at", json_string(http_to_date(activated).c_str()));
json_object_set_new(attr, "uptime", json_integer(maxscale_uptime()));
json_t* obj = json_object();

View File

@ -23,6 +23,7 @@
#include <algorithm>
#include <maxscale/atomic.h>
#include <maxscale/hk_heartbeat.h>
#include <maxscale/jansson.hh>
#include <maxscale/json_api.h>
#include <maxscale/paths.h>
@ -758,6 +759,13 @@ bool runtime_alter_maxscale(const char* name, const char* value)
MXS_NOTICE("Updated '%s' from '%s' to '%s'", CN_PASSIVE,
cnf.passive ? "true" : "false",
boolval ? "true" : "false");
if (cnf.passive && !boolval)
{
// This MaxScale is being promoted to the active instance
cnf.promoted_at = hkheartbeat;
}
cnf.passive = boolval;
rval = true;
}

View File

@ -35,6 +35,9 @@ MXS_BEGIN_DECLS
/** Default script execution timeout in seconds */
#define DEFAULT_SCRIPT_TIMEOUT 90
/** Default failover script timeout */
#define DEFAULT_FAILOVER_TIMEOUT 90
/**
* Monitor network timeout types
*/

View File

@ -27,15 +27,16 @@
#include <sys/stat.h>
#include <maxscale/alloc.h>
#include <mysqld_error.h>
#include <maxscale/paths.h>
#include <maxscale/utils.h>
#include <maxscale/hk_heartbeat.h>
#include <maxscale/json_api.h>
#include <maxscale/log_manager.h>
#include <maxscale/mysql_utils.h>
#include <maxscale/paths.h>
#include <maxscale/pcre2.h>
#include <maxscale/secrets.h>
#include <maxscale/spinlock.h>
#include <maxscale/json_api.h>
#include <maxscale/utils.h>
#include <mysqld_error.h>
#include "maxscale/config.h"
#include "maxscale/externcmd.h"
@ -135,6 +136,7 @@ MXS_MONITOR* monitor_alloc(const char *name, const char *module)
mon->script_timeout = DEFAULT_SCRIPT_TIMEOUT;
mon->parameters = NULL;
mon->server_pending_changes = false;
mon->failover_timeout = DEFAULT_FAILOVER_TIMEOUT;
spinlock_init(&mon->lock);
spinlock_acquire(&monLock);
mon->next = allMonitors;
@ -321,7 +323,8 @@ bool monitorAddServer(MXS_MONITOR *mon, SERVER *server)
db->next = NULL;
db->mon_err_count = 0;
db->log_version_err = true;
db->last_event = UNDEFINED_EVENT;
db->new_event = true;
/** Server status is uninitialized */
db->mon_prev_status = -1;
/* pending status is updated by get_replication_tree */
@ -1048,15 +1051,8 @@ static mxs_monitor_event_t mon_get_event_type(MXS_MONITOR_SERVERS* node)
return rval;
}
/*
* Given a monitor event (enum) provide a text string equivalent
* @param node The monitor server data whose event is wanted
* @result string The name of the monitor event for the server
*/
static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
const char* mon_get_event_name(mxs_monitor_event_t event)
{
mxs_monitor_event_t event = mon_get_event_type(node);
for (int i = 0; mxs_monitor_event_enum_values[i].name; i++)
{
if (mxs_monitor_event_enum_values[i].enum_value & event)
@ -1069,6 +1065,16 @@ static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
return "undefined_event";
}
/*
* Given a monitor event (enum) provide a text string equivalent
* @param node The monitor server data whose event is wanted
* @result string The name of the monitor event for the server
*/
static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
{
return mon_get_event_name(mon_get_event_type(node));
}
enum credentials_approach_t
{
CREDENTIALS_INCLUDE,
@ -1718,13 +1724,55 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_
* In this case, a failover script should be called if no master_up
* or new_master events are triggered within a pre-defined time limit.
*/
ptr->last_event = mon_get_event_type(ptr);
mxs_monitor_event_t event = mon_get_event_type(ptr);
ptr->server->last_event = event;
ptr->server->triggered_at = hkheartbeat;
ptr->new_event = true;
if (event == MASTER_DOWN_EVENT)
{
monitor->last_master_down = hkheartbeat;
}
else if (event == MASTER_UP_EVENT || event == NEW_MASTER_EVENT)
{
monitor->last_master_up = hkheartbeat;
}
if (script && (events & mon_get_event_type(ptr)))
{
monitor_launch_script(monitor, ptr, script);
}
}
else
{
/**
* If a master_down event was triggered when this MaxScale was
* passive, we need to execute the failover script again if no new
* masters have appeared and this MaxScale has been set as active
* since the event took place.
*/
MXS_CONFIG* cnf = config_get_global_options();
if (!cnf->passive && // This is not a passive MaxScale
ptr->server->last_event == MASTER_DOWN_EVENT && // This is a master that went down
cnf->promoted_at >= ptr->server->triggered_at && // Promoted to active after the event took place
ptr->new_event && // Event has not yet been processed
monitor->last_master_down > monitor->last_master_up) // Latest relevant event
{
// Scale the timeout to heartbeat resolution which is 1/10th of a second
int64_t timeout = (int64_t)monitor->failover_timeout * 10;
int64_t t = hkheartbeat - ptr->server->triggered_at;
if (t > timeout)
{
MXS_WARNING("Failover of server '%s' did not take place within "
"%u seconds, failover script needs to be re-triggered",
ptr->server->unique_name, monitor->failover_timeout);
// TODO: Launch the failover script
ptr->new_event = false;
}
}
}
}
}

View File

@ -37,6 +37,9 @@
#include <maxscale/utils.h>
#include <maxscale/semaphore.hh>
#include <maxscale/json_api.h>
#include <maxscale/hk_heartbeat.h>
#include <maxscale/http.hh>
#include <maxscale/maxscale.h>
#include "maxscale/monitor.h"
#include "maxscale/poll.h"
@ -142,6 +145,10 @@ SERVER* server_alloc(const char *name, const char *address, unsigned short port,
server->charset = SERVER_DEFAULT_CHARSET;
server->proxy_protocol = false;
// Set last event to server_up as the server is in Running state on startup
server->last_event = SERVER_UP_EVENT;
server->triggered_at = 0;
spinlock_acquire(&server_spin);
server->next = allServers;
allServers = server;
@ -496,6 +503,11 @@ dprintServer(DCB *dcb, const SERVER *server)
dcb_printf(dcb, "\tServer Version: %s\n", server->version_string);
dcb_printf(dcb, "\tNode Id: %ld\n", server->node_id);
dcb_printf(dcb, "\tMaster Id: %ld\n", server->master_id);
dcb_printf(dcb, "\tLast event: %s\n",
mon_get_event_name((mxs_monitor_event_t)server->last_event));
time_t t = maxscale_started() + (server->triggered_at / 10);
dcb_printf(dcb, "\tTriggered at: %s\n", http_to_date(t).c_str());
if (server->slaves)
{
int i;
@ -1451,6 +1463,11 @@ static json_t* server_json_attributes(const SERVER* server)
json_object_set_new(attr, "master_id", json_integer(server->master_id));
json_object_set_new(attr, "replication_depth", json_integer(server->depth));
const char* event_name = mon_get_event_name((mxs_monitor_event_t)server->last_event);
time_t t = maxscale_started() + (server->triggered_at / 10);
json_object_set_new(attr, "last_event", json_string(event_name));
json_object_set_new(attr, "triggered_at", json_string(http_to_date(t).c_str()));
if (server->slaves)
{
json_t* slaves = json_array();