From 00feb61b2349cfa03151369c6962e8a18dd22455 Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Fri, 30 Aug 2019 18:14:04 +0300 Subject: [PATCH 1/2] MXS-2652 Do not clear maintenance flag when a server goes down The set of flags to clear should be well-defined. --- include/maxscale/monitor.hh | 4 ++++ server/core/monitor.cc | 8 +------- server/modules/monitor/mariadbmon/mariadbmon.cc | 6 +++--- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/maxscale/monitor.hh b/include/maxscale/monitor.hh index 5359b09b4..49de3c0c4 100644 --- a/include/maxscale/monitor.hh +++ b/include/maxscale/monitor.hh @@ -118,6 +118,10 @@ public: static int64_t get_time_ms(); protected: + // When a monitor detects that a server is down, these bits should be cleared. + static constexpr uint64_t SERVER_DOWN_CLEAR_BITS {SERVER_RUNNING | SERVER_AUTH_ERROR | SERVER_MASTER + | SERVER_SLAVE | SERVER_SLAVE_OF_EXT_MASTER | SERVER_RELAY | SERVER_JOINED | SERVER_NDB}; + MonitorInstance(MXS_MONITOR* pMonitor); /** diff --git a/server/core/monitor.cc b/server/core/monitor.cc index 5c85a5138..3cded8eca 100644 --- a/server/core/monitor.cc +++ b/server/core/monitor.cc @@ -2839,18 +2839,12 @@ void MonitorInstanceSimple::tick() * the server state bits. This would allow clearing the state by * zeroing it out. */ - const uint64_t bits_to_clear = ~SERVER_WAS_MASTER; - - monitor_clear_pending_status(pMs, bits_to_clear); + monitor_clear_pending_status(pMs, SERVER_DOWN_CLEAR_BITS); if (mysql_errno(pMs->con) == ER_ACCESS_DENIED_ERROR) { monitor_set_pending_status(pMs, SERVER_AUTH_ERROR); } - else - { - monitor_clear_pending_status(pMs, SERVER_AUTH_ERROR); - } if (mon_status_changed(pMs) && mon_print_fail_status(pMs)) { diff --git a/server/modules/monitor/mariadbmon/mariadbmon.cc b/server/modules/monitor/mariadbmon/mariadbmon.cc index d0baf04e1..704360b6d 100644 --- a/server/modules/monitor/mariadbmon/mariadbmon.cc +++ b/server/modules/monitor/mariadbmon/mariadbmon.cc @@ -379,9 +379,9 @@ void MariaDBMonitor::update_server(MariaDBServer* server) } else { - /* The current server is not running. Clear all but the stale master bit as it is used to detect - * masters that went down but came up. */ - server->clear_status(~SERVER_WAS_MASTER); + // The server is not running. Clear some of the bits. User-set bits and some long-term bits + // can stay. + server->clear_status(SERVER_DOWN_CLEAR_BITS); auto conn_errno = mysql_errno(conn); if (conn_errno == ER_ACCESS_DENIED_ERROR || conn_errno == ER_ACCESS_DENIED_NO_PASSWORD_ERROR) { From ff2048625bab144930fa8dcc3a99c677f6f0dbf9 Mon Sep 17 00:00:00 2001 From: Esa Korhonen Date: Thu, 5 Sep 2019 09:57:44 +0300 Subject: [PATCH 2/2] MXS-2652 Add test Test case added to an existing test. --- maxscale-system-test/CMakeLists.txt | 2 +- .../mysqlmon_failover_no_slaves.cpp | 81 +++++++++++++++++-- 2 files changed, 75 insertions(+), 8 deletions(-) diff --git a/maxscale-system-test/CMakeLists.txt b/maxscale-system-test/CMakeLists.txt index 49fa2c6e5..3440f5f70 100644 --- a/maxscale-system-test/CMakeLists.txt +++ b/maxscale-system-test/CMakeLists.txt @@ -246,7 +246,7 @@ add_test_executable(mysqlmon_switchover.cpp mysqlmon_switchover mysqlmon_switcho # MySQL Monitor switchover with bad master add_test_executable(mysqlmon_switchover_bad_master.cpp mysqlmon_switchover_bad_master mysqlmon_switchover_bad_master LABELS mysqlmon REPL_BACKEND) -# MySQL Monitor manual failover with no valid slaves, uses config of mysqlmon_failover_auto +# MySQL Monitor manual failover with no valid slaves, uses config of mysqlmon_failover_auto. Also MXS-2652. add_test_executable(mysqlmon_failover_no_slaves.cpp mysqlmon_failover_no_slaves mysqlmon_failover_auto LABELS mysqlmon REPL_BACKEND) # MySQL Monitor Rejoin (good) Test diff --git a/maxscale-system-test/mysqlmon_failover_no_slaves.cpp b/maxscale-system-test/mysqlmon_failover_no_slaves.cpp index 97c22aad9..95f9755b5 100644 --- a/maxscale-system-test/mysqlmon_failover_no_slaves.cpp +++ b/maxscale-system-test/mysqlmon_failover_no_slaves.cpp @@ -11,8 +11,21 @@ * Public License. */ +// MXS-2652: https://jira.mariadb.org/browse/MXS-2652 + #include "testconnections.h" #include "fail_switch_rejoin_common.cpp" +using std::string; + +namespace +{ +void expect_maintenance(TestConnections& test, std::string server_name, bool value); +void expect_running(TestConnections& test, std::string server_name, bool value); + +const string running = "Running"; +const string down = "Down"; +const string maint = "Maintenance"; +} int main(int argc, char** argv) { @@ -38,13 +51,33 @@ int main(int argc, char** argv) test.repl->stash_server_settings(2); test.repl->disable_server_setting(2, "log-bin"); test.repl->start_node(2, (char*) ""); - // Slave 3. Stop this slave as well. The monitor is quick to turn failover off it a slave has another - // slave connection or if a slave connection is not using gtid, so those situations are hard to test. - // This may change later when failover support for such situations is added, so add a more interesting - // test for node 3 then. - test.try_query(nodes[3], "STOP SLAVE"); - test.maxscales->wait_for_monitor(); + + // Slave 3. Set node to maintenance, then shut it down. Simultaneously check issue + // MXS-2652: Maintenance flag should persist when server goes down & comes back up. + int server_ind = 3; + int server_num = server_ind + 1; + string server_name = "server" + std::to_string(server_num); + expect_maintenance(test, server_name, false); + + if (test.ok()) + { + test.maxscales->ssh_node_f(0, true, "maxadmin set server %s maintenance", server_name.c_str()); + test.maxscales->wait_for_monitor(); + expect_running(test, server_name, true); + expect_maintenance(test, server_name, true); + + test.repl->stop_node(server_ind); + test.maxscales->wait_for_monitor(); + expect_running(test, server_name, false); + expect_maintenance(test, server_name, true); + + test.repl->start_node(server_ind); + test.maxscales->wait_for_monitor(); + expect_running(test, server_name, true); + expect_maintenance(test, server_name, true); + } + get_output(test); test.tprintf(LINE); @@ -62,7 +95,41 @@ int main(int argc, char** argv) test.repl->stop_node(2); test.repl->restore_server_settings(2); test.repl->start_node(2, (char*) ""); - test.try_query(nodes[3], "START SLAVE;"); + test.maxscales->ssh_node_f(0, true, "maxadmin clear server %s maintenance", server_name.c_str()); + test.repl->fix_replication(); return test.global_result; } + +namespace +{ +void expect_running(TestConnections& test, std::string server_name, bool value) +{ + auto states = test.get_server_status(server_name.c_str()); + if (value) + { + test.expect(states.count(running) == 1, "'%s' is not running when it should be.", + server_name.c_str()); + } + else + { + test.expect(states.count(down) == 1, "'%s' is not down when it should be.", + server_name.c_str()); + } +} + +void expect_maintenance(TestConnections& test, std::string server_name, bool value) +{ + auto states = test.get_server_status(server_name.c_str()); + if (value) + { + test.expect(states.count(maint) == 1, "'%s' is not in maintenance when it should be.", + server_name.c_str()); + } + else + { + test.expect(states.count(maint) == 0, "'%s' is in maintenance when it should not be.", + server_name.c_str()); + } +} +}