MXS-1446: Store more detailed event information
The timestamp of the last change from passive to active is now tracked. This, with the timestamps of the last master_down and master_up events, allows detection of cases when MaxScale was failed over but the failover was not done. Currently, only a warning is logged if no new master has appeared within 90 seconds of a master_down event and MaxScale was set to active from passive. The last event and when the event was triggered is now shown for all servers. The latest change from passive to active is also shown.
This commit is contained in:
parent
3e1d89ff17
commit
ef2ee38ccf
@ -210,6 +210,8 @@ typedef struct
|
||||
unsigned int auth_write_timeout; /**< Write timeout for the user authentication */
|
||||
bool skip_permission_checks; /**< Skip service and monitor permission checks */
|
||||
bool passive; /**< True if MaxScale is in passive mode */
|
||||
int64_t promoted_at; /**< Time when this Maxscale instance was
|
||||
* promoted from a passive to an active */
|
||||
char qc_name[PATH_MAX]; /**< The name of the query classifier to load */
|
||||
char* qc_args; /**< Arguments for the query classifier */
|
||||
qc_sql_mode_t qc_sql_mode; /**< The query classifier sql mode */
|
||||
|
@ -169,7 +169,7 @@ typedef struct monitor_servers
|
||||
int mon_err_count;
|
||||
unsigned int mon_prev_status;
|
||||
unsigned int pending_status; /**< Pending Status flag bitmap */
|
||||
mxs_monitor_event_t last_event; /**< The last event that occurred on this server*/
|
||||
bool new_event; /**< Whether an action was taken on the last event */
|
||||
struct monitor_servers *next; /**< The next server in the list */
|
||||
} MXS_MONITOR_SERVERS;
|
||||
|
||||
@ -205,6 +205,9 @@ struct mxs_monitor
|
||||
bool active; /**< True if monitor is active */
|
||||
time_t journal_max_age; /**< Maximum age of journal file */
|
||||
uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */
|
||||
int32_t failover_timeout; /**< Timeout in seconds for failover script */
|
||||
int64_t last_master_up; /**< Time when the last master_up event was triggered */
|
||||
int64_t last_master_down; /**< Time when the last master_down event was triggered */
|
||||
struct mxs_monitor *next; /**< Next monitor in the linked list */
|
||||
};
|
||||
|
||||
@ -266,6 +269,7 @@ bool mon_print_fail_status(MXS_MONITOR_SERVERS* mon_srv);
|
||||
|
||||
mxs_connect_result_t mon_ping_or_connect_to_db(MXS_MONITOR* mon, MXS_MONITOR_SERVERS *database);
|
||||
void mon_log_connect_error(MXS_MONITOR_SERVERS* database, mxs_connect_result_t rval);
|
||||
const char* mon_get_event_name(mxs_monitor_event_t event);
|
||||
|
||||
void lock_monitor_servers(MXS_MONITOR *monitor);
|
||||
void release_monitor_servers(MXS_MONITOR *monitor);
|
||||
|
@ -135,6 +135,8 @@ typedef struct server
|
||||
uint8_t charset; /**< Default server character set */
|
||||
bool is_active; /**< Server is active and has not been "destroyed" */
|
||||
bool proxy_protocol; /**< Send proxy-protocol header to backend when connecting client sessions. */
|
||||
int last_event; /**< The last event that occurred on this server */
|
||||
int64_t triggered_at; /**< Time when the last event was triggered */
|
||||
#if defined(SS_DEBUG)
|
||||
skygw_chk_t server_chk_tail;
|
||||
#endif
|
||||
|
@ -30,6 +30,8 @@ exports.builder = function(yargs) {
|
||||
{'Address': 'attributes.parameters.address'},
|
||||
{'Port': 'attributes.parameters.port'},
|
||||
{'State': 'attributes.state'},
|
||||
{'Last Event': 'attributes.last_event'},
|
||||
{'Triggered At': 'attributes.triggered_at'},
|
||||
{'Services': 'relationships.services.data[].id'},
|
||||
{'Monitors': 'relationships.monitors.data[].id'},
|
||||
{'Master ID': 'attributes.master_id'},
|
||||
@ -131,6 +133,7 @@ exports.builder = function(yargs) {
|
||||
{'Version': 'attributes.version'},
|
||||
{'Commit': 'attributes.commit'},
|
||||
{'Started At': 'attributes.started_at'},
|
||||
{'Activated At': 'attributes.activated_at'},
|
||||
{'Uptime': 'attributes.uptime'},
|
||||
{'Parameters': 'attributes.parameters'}
|
||||
])
|
||||
|
@ -43,6 +43,7 @@
|
||||
#include <maxscale/http.hh>
|
||||
#include <maxscale/version.h>
|
||||
#include <maxscale/maxscale.h>
|
||||
#include <maxscale/hk_heartbeat.h>
|
||||
|
||||
#include "maxscale/config.h"
|
||||
#include "maxscale/filter.h"
|
||||
@ -1761,6 +1762,7 @@ void config_set_global_defaults()
|
||||
gateway.admin_ssl_cert[0] = '\0';
|
||||
gateway.admin_ssl_ca_cert[0] = '\0';
|
||||
gateway.passive = false;
|
||||
gateway.promoted_at = 0;
|
||||
|
||||
gateway.thread_stack_size = 0;
|
||||
pthread_attr_t attr;
|
||||
@ -3913,10 +3915,13 @@ json_t* config_maxscale_to_json(const char* host)
|
||||
}
|
||||
|
||||
json_t* attr = json_object();
|
||||
time_t started = maxscale_started();
|
||||
time_t activated = started + (cnf->promoted_at / 10);
|
||||
json_object_set_new(attr, CN_PARAMETERS, param);
|
||||
json_object_set_new(attr, "version", json_string(MAXSCALE_VERSION));
|
||||
json_object_set_new(attr, "commit", json_string(MAXSCALE_COMMIT));
|
||||
json_object_set_new(attr, "started_at", json_string(http_to_date(maxscale_started()).c_str()));
|
||||
json_object_set_new(attr, "started_at", json_string(http_to_date(started).c_str()));
|
||||
json_object_set_new(attr, "activated_at", json_string(http_to_date(activated).c_str()));
|
||||
json_object_set_new(attr, "uptime", json_integer(maxscale_uptime()));
|
||||
|
||||
json_t* obj = json_object();
|
||||
|
@ -23,6 +23,7 @@
|
||||
#include <algorithm>
|
||||
|
||||
#include <maxscale/atomic.h>
|
||||
#include <maxscale/hk_heartbeat.h>
|
||||
#include <maxscale/jansson.hh>
|
||||
#include <maxscale/json_api.h>
|
||||
#include <maxscale/paths.h>
|
||||
@ -758,6 +759,13 @@ bool runtime_alter_maxscale(const char* name, const char* value)
|
||||
MXS_NOTICE("Updated '%s' from '%s' to '%s'", CN_PASSIVE,
|
||||
cnf.passive ? "true" : "false",
|
||||
boolval ? "true" : "false");
|
||||
|
||||
if (cnf.passive && !boolval)
|
||||
{
|
||||
// This MaxScale is being promoted to the active instance
|
||||
cnf.promoted_at = hkheartbeat;
|
||||
}
|
||||
|
||||
cnf.passive = boolval;
|
||||
rval = true;
|
||||
}
|
||||
|
@ -35,6 +35,9 @@ MXS_BEGIN_DECLS
|
||||
/** Default script execution timeout in seconds */
|
||||
#define DEFAULT_SCRIPT_TIMEOUT 90
|
||||
|
||||
/** Default failover script timeout */
|
||||
#define DEFAULT_FAILOVER_TIMEOUT 90
|
||||
|
||||
/**
|
||||
* Monitor network timeout types
|
||||
*/
|
||||
|
@ -27,15 +27,16 @@
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include <maxscale/alloc.h>
|
||||
#include <mysqld_error.h>
|
||||
#include <maxscale/paths.h>
|
||||
#include <maxscale/utils.h>
|
||||
#include <maxscale/hk_heartbeat.h>
|
||||
#include <maxscale/json_api.h>
|
||||
#include <maxscale/log_manager.h>
|
||||
#include <maxscale/mysql_utils.h>
|
||||
#include <maxscale/paths.h>
|
||||
#include <maxscale/pcre2.h>
|
||||
#include <maxscale/secrets.h>
|
||||
#include <maxscale/spinlock.h>
|
||||
#include <maxscale/json_api.h>
|
||||
#include <maxscale/utils.h>
|
||||
#include <mysqld_error.h>
|
||||
|
||||
#include "maxscale/config.h"
|
||||
#include "maxscale/externcmd.h"
|
||||
@ -135,6 +136,7 @@ MXS_MONITOR* monitor_alloc(const char *name, const char *module)
|
||||
mon->script_timeout = DEFAULT_SCRIPT_TIMEOUT;
|
||||
mon->parameters = NULL;
|
||||
mon->server_pending_changes = false;
|
||||
mon->failover_timeout = DEFAULT_FAILOVER_TIMEOUT;
|
||||
spinlock_init(&mon->lock);
|
||||
spinlock_acquire(&monLock);
|
||||
mon->next = allMonitors;
|
||||
@ -321,7 +323,8 @@ bool monitorAddServer(MXS_MONITOR *mon, SERVER *server)
|
||||
db->next = NULL;
|
||||
db->mon_err_count = 0;
|
||||
db->log_version_err = true;
|
||||
db->last_event = UNDEFINED_EVENT;
|
||||
db->new_event = true;
|
||||
|
||||
/** Server status is uninitialized */
|
||||
db->mon_prev_status = -1;
|
||||
/* pending status is updated by get_replication_tree */
|
||||
@ -1048,15 +1051,8 @@ static mxs_monitor_event_t mon_get_event_type(MXS_MONITOR_SERVERS* node)
|
||||
return rval;
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a monitor event (enum) provide a text string equivalent
|
||||
* @param node The monitor server data whose event is wanted
|
||||
* @result string The name of the monitor event for the server
|
||||
*/
|
||||
static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
|
||||
const char* mon_get_event_name(mxs_monitor_event_t event)
|
||||
{
|
||||
mxs_monitor_event_t event = mon_get_event_type(node);
|
||||
|
||||
for (int i = 0; mxs_monitor_event_enum_values[i].name; i++)
|
||||
{
|
||||
if (mxs_monitor_event_enum_values[i].enum_value & event)
|
||||
@ -1069,6 +1065,16 @@ static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
|
||||
return "undefined_event";
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a monitor event (enum) provide a text string equivalent
|
||||
* @param node The monitor server data whose event is wanted
|
||||
* @result string The name of the monitor event for the server
|
||||
*/
|
||||
static const char* mon_get_event_name(MXS_MONITOR_SERVERS* node)
|
||||
{
|
||||
return mon_get_event_name(mon_get_event_type(node));
|
||||
}
|
||||
|
||||
enum credentials_approach_t
|
||||
{
|
||||
CREDENTIALS_INCLUDE,
|
||||
@ -1718,13 +1724,55 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_
|
||||
* In this case, a failover script should be called if no master_up
|
||||
* or new_master events are triggered within a pre-defined time limit.
|
||||
*/
|
||||
ptr->last_event = mon_get_event_type(ptr);
|
||||
mxs_monitor_event_t event = mon_get_event_type(ptr);
|
||||
ptr->server->last_event = event;
|
||||
ptr->server->triggered_at = hkheartbeat;
|
||||
ptr->new_event = true;
|
||||
|
||||
if (event == MASTER_DOWN_EVENT)
|
||||
{
|
||||
monitor->last_master_down = hkheartbeat;
|
||||
}
|
||||
else if (event == MASTER_UP_EVENT || event == NEW_MASTER_EVENT)
|
||||
{
|
||||
monitor->last_master_up = hkheartbeat;
|
||||
}
|
||||
|
||||
if (script && (events & mon_get_event_type(ptr)))
|
||||
{
|
||||
monitor_launch_script(monitor, ptr, script);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/**
|
||||
* If a master_down event was triggered when this MaxScale was
|
||||
* passive, we need to execute the failover script again if no new
|
||||
* masters have appeared and this MaxScale has been set as active
|
||||
* since the event took place.
|
||||
*/
|
||||
MXS_CONFIG* cnf = config_get_global_options();
|
||||
|
||||
if (!cnf->passive && // This is not a passive MaxScale
|
||||
ptr->server->last_event == MASTER_DOWN_EVENT && // This is a master that went down
|
||||
cnf->promoted_at >= ptr->server->triggered_at && // Promoted to active after the event took place
|
||||
ptr->new_event && // Event has not yet been processed
|
||||
monitor->last_master_down > monitor->last_master_up) // Latest relevant event
|
||||
{
|
||||
// Scale the timeout to heartbeat resolution which is 1/10th of a second
|
||||
int64_t timeout = (int64_t)monitor->failover_timeout * 10;
|
||||
int64_t t = hkheartbeat - ptr->server->triggered_at;
|
||||
|
||||
if (t > timeout)
|
||||
{
|
||||
MXS_WARNING("Failover of server '%s' did not take place within "
|
||||
"%u seconds, failover script needs to be re-triggered",
|
||||
ptr->server->unique_name, monitor->failover_timeout);
|
||||
// TODO: Launch the failover script
|
||||
ptr->new_event = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -37,6 +37,9 @@
|
||||
#include <maxscale/utils.h>
|
||||
#include <maxscale/semaphore.hh>
|
||||
#include <maxscale/json_api.h>
|
||||
#include <maxscale/hk_heartbeat.h>
|
||||
#include <maxscale/http.hh>
|
||||
#include <maxscale/maxscale.h>
|
||||
|
||||
#include "maxscale/monitor.h"
|
||||
#include "maxscale/poll.h"
|
||||
@ -142,6 +145,10 @@ SERVER* server_alloc(const char *name, const char *address, unsigned short port,
|
||||
server->charset = SERVER_DEFAULT_CHARSET;
|
||||
server->proxy_protocol = false;
|
||||
|
||||
// Set last event to server_up as the server is in Running state on startup
|
||||
server->last_event = SERVER_UP_EVENT;
|
||||
server->triggered_at = 0;
|
||||
|
||||
spinlock_acquire(&server_spin);
|
||||
server->next = allServers;
|
||||
allServers = server;
|
||||
@ -496,6 +503,11 @@ dprintServer(DCB *dcb, const SERVER *server)
|
||||
dcb_printf(dcb, "\tServer Version: %s\n", server->version_string);
|
||||
dcb_printf(dcb, "\tNode Id: %ld\n", server->node_id);
|
||||
dcb_printf(dcb, "\tMaster Id: %ld\n", server->master_id);
|
||||
dcb_printf(dcb, "\tLast event: %s\n",
|
||||
mon_get_event_name((mxs_monitor_event_t)server->last_event));
|
||||
time_t t = maxscale_started() + (server->triggered_at / 10);
|
||||
dcb_printf(dcb, "\tTriggered at: %s\n", http_to_date(t).c_str());
|
||||
|
||||
if (server->slaves)
|
||||
{
|
||||
int i;
|
||||
@ -1451,6 +1463,11 @@ static json_t* server_json_attributes(const SERVER* server)
|
||||
json_object_set_new(attr, "master_id", json_integer(server->master_id));
|
||||
json_object_set_new(attr, "replication_depth", json_integer(server->depth));
|
||||
|
||||
const char* event_name = mon_get_event_name((mxs_monitor_event_t)server->last_event);
|
||||
time_t t = maxscale_started() + (server->triggered_at / 10);
|
||||
json_object_set_new(attr, "last_event", json_string(event_name));
|
||||
json_object_set_new(attr, "triggered_at", json_string(http_to_date(t).c_str()));
|
||||
|
||||
if (server->slaves)
|
||||
{
|
||||
json_t* slaves = json_array();
|
||||
|
Loading…
x
Reference in New Issue
Block a user