From 2d1e5f46fad9a7cb8e20fbdaa31b7ee977a797a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Markus=20M=C3=A4kel=C3=A4?= Date: Fri, 27 Oct 2017 10:26:10 +0300 Subject: [PATCH] Remove use of timestamps in failover code Using timestamps to detect whether MaxScale was active or passive can cause problems if multiple events happen at the same time. This can be avoided by separating events into actively observed and passively observed events. This clarifies the logic by removing the ambiguity of timestamps. As the monitoring threads are separate from the worker threads, it is prudent to use atomic operations to modify and read the state of the MaxScale. This will impose an happens-before relation between MaxScale being set into passive mode and events being classified as being passively observed. --- include/maxscale/config.h | 2 +- include/maxscale/monitor.h | 3 +- include/maxscale/server.h | 1 + server/core/config_runtime.cc | 4 +- server/core/monitor.cc | 5 +- server/modules/monitor/mysqlmon/mysql_mon.cc | 53 ++++++++------------ 6 files changed, 29 insertions(+), 39 deletions(-) diff --git a/include/maxscale/config.h b/include/maxscale/config.h index d727cf648..ee3868a70 100644 --- a/include/maxscale/config.h +++ b/include/maxscale/config.h @@ -212,7 +212,7 @@ typedef struct unsigned int auth_read_timeout; /**< Read timeout for the user authentication */ unsigned int auth_write_timeout; /**< Write timeout for the user authentication */ bool skip_permission_checks; /**< Skip service and monitor permission checks */ - bool passive; /**< True if MaxScale is in passive mode */ + int32_t passive; /**< True if MaxScale is in passive mode */ int64_t promoted_at; /**< Time when this Maxscale instance was * promoted from a passive to an active */ char qc_name[PATH_MAX]; /**< The name of the query classifier to load */ diff --git a/include/maxscale/monitor.h b/include/maxscale/monitor.h index f8d1de5c0..343f51124 100644 --- a/include/maxscale/monitor.h +++ b/include/maxscale/monitor.h @@ -205,8 +205,7 @@ struct mxs_monitor bool active; /**< True if monitor is active */ time_t journal_max_age; /**< Maximum age of journal file */ uint32_t script_timeout; /**< Timeout in seconds for the monitor scripts */ - int64_t last_master_up; /**< Time when the last master_up event was triggered */ - int64_t last_master_down; /**< Time when the last master_down event was triggered */ + bool master_has_failed; /**< Set to true when the latest event is a master_down event */ struct mxs_monitor *next; /**< Next monitor in the linked list */ }; diff --git a/include/maxscale/server.h b/include/maxscale/server.h index 457541097..10c7dbc88 100644 --- a/include/maxscale/server.h +++ b/include/maxscale/server.h @@ -136,6 +136,7 @@ typedef struct server bool is_active; /**< Server is active and has not been "destroyed" */ bool proxy_protocol; /**< Send proxy-protocol header to backend when connecting client sessions. */ int last_event; /**< The last event that occurred on this server */ + bool active_event; /**< Event observed when MaxScale was active */ int64_t triggered_at; /**< Time when the last event was triggered */ #if defined(SS_DEBUG) skygw_chk_t server_chk_tail; diff --git a/server/core/config_runtime.cc b/server/core/config_runtime.cc index 332ecdd56..623bcceed 100644 --- a/server/core/config_runtime.cc +++ b/server/core/config_runtime.cc @@ -763,10 +763,10 @@ bool runtime_alter_maxscale(const char* name, const char* value) if (cnf.passive && !boolval) { // This MaxScale is being promoted to the active instance - cnf.promoted_at = hkheartbeat; + atomic_store_int64(&cnf.promoted_at, hkheartbeat); } - cnf.passive = boolval; + atomic_store_int32(&cnf.passive, boolval); rval = true; } else diff --git a/server/core/monitor.cc b/server/core/monitor.cc index 850133390..190836a1f 100644 --- a/server/core/monitor.cc +++ b/server/core/monitor.cc @@ -1755,16 +1755,17 @@ void mon_process_state_changes(MXS_MONITOR *monitor, const char *script, uint64_ mxs_monitor_event_t event = mon_get_event_type(ptr); ptr->server->last_event = event; ptr->server->triggered_at = hkheartbeat; + ptr->server->active_event = !atomic_load_int32(&config_get_global_options()->passive); ptr->new_event = true; mon_log_state_change(ptr); if (event == MASTER_DOWN_EVENT) { - monitor->last_master_down = hkheartbeat; + monitor->master_has_failed = true; } else if (event == MASTER_UP_EVENT || event == NEW_MASTER_EVENT) { - monitor->last_master_up = hkheartbeat; + monitor->master_has_failed = false; } if (script && (events & event)) diff --git a/server/modules/monitor/mysqlmon/mysql_mon.cc b/server/modules/monitor/mysqlmon/mysql_mon.cc index dbfd4596c..7f4378f4c 100644 --- a/server/modules/monitor/mysqlmon/mysql_mon.cc +++ b/server/modules/monitor/mysqlmon/mysql_mon.cc @@ -2892,43 +2892,32 @@ bool mon_process_failover(MYSQL_MONITOR* monitor, const char* failover_script, u for (MXS_MONITORED_SERVER *ptr = monitor->monitor->monitored_servers; ptr; ptr = ptr->next) { - if (mon_status_changed(ptr)) + if (ptr->new_event && !cnf->passive && + ptr->server->last_event == MASTER_DOWN_EVENT) { - if (ptr->server->last_event == MASTER_DOWN_EVENT) + if (failed_master) { - if (!cnf->passive) - { - if (failed_master) - { - MXS_ALERT("Multiple failed master servers detected: " - "'%s' is the first master to fail but server " - "'%s' has also triggered a master_down event.", - failed_master->server->unique_name, - ptr->server->unique_name); - return false; - } - else - { - failed_master = ptr; - } - } + MXS_ALERT("Multiple failed master servers detected: " + "'%s' is the first master to fail but server " + "'%s' has also triggered a master_down event.", + failed_master->server->unique_name, + ptr->server->unique_name); + return false; } - } - else - { - /** - * If a master_down event was triggered when this MaxScale was - * passive, we need to execute the failover script again if no new - * masters have appeared and this MaxScale has been set as active - * since the event took place. - */ - if (!cnf->passive && // This is not a passive MaxScale - ptr->server->last_event == MASTER_DOWN_EVENT && // This is a master that went down - cnf->promoted_at >= ptr->server->triggered_at && // Promoted to active after the event took place - ptr->new_event && // Event has not yet been processed - monitor->monitor->last_master_down > monitor->monitor->last_master_up) // Latest relevant event + if (ptr->server->active_event) { + // MaxScale was active when the event took place + failed_master = ptr; + ptr->new_event = false; + } + else if (monitor->monitor->master_has_failed) + { + /** + * If a master_down event was triggered when this MaxScale was + * passive, we need to execute the failover script again if no new + * masters have appeared. + */ int64_t timeout = SEC_TO_HB(failover_timeout); int64_t t = hkheartbeat - ptr->server->triggered_at;