From f29e5b65de1d8f74c28e6fd314b043e4218e5fcd Mon Sep 17 00:00:00 2001 From: Niclas Antti Date: Fri, 9 Nov 2018 10:18:22 +0200 Subject: [PATCH] MXS-2057 systemd watchdog Systemd watchdog notification at a little more than 2/3 of the systemd configured time. In the service config (maxscale.service) add e.g. WatchdogSec=30s to set and enable the watchdog. For building: install libsystemd-dev. The next commit will modify cmake configuration and code to conditionally compile the new code based on existence of libsystemd-dev. --- include/maxscale/routingworker.hh | 11 +++++ maxutils/maxbase/src/CMakeLists.txt | 4 ++ server/core/gateway.cc | 8 ++++ server/core/routingworker.cc | 71 ++++++++++++++++++++++++++++- 4 files changed, 93 insertions(+), 1 deletion(-) diff --git a/include/maxscale/routingworker.hh b/include/maxscale/routingworker.hh index a05fb3743..19dfe2438 100644 --- a/include/maxscale/routingworker.hh +++ b/include/maxscale/routingworker.hh @@ -18,10 +18,12 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -426,6 +428,10 @@ public: */ static std::unique_ptr get_qc_stats_as_json(const char* zHost, int id); + /** + * To be called from the initial (parent) thread if the systemd watchdog is on. + */ + static void set_watchdog_interval(uint64_t microseconds); private: const int m_id; /*< The id of the worker. */ SessionsById m_sessions; /*< A mapping of session_id->MXS_SESSION. The map @@ -447,9 +453,14 @@ private: void epoll_tick(); // override void delete_zombies(); + void check_systemd_watchdog(); static uint32_t epoll_instance_handler(MXB_POLL_DATA* data, MXB_WORKER* worker, uint32_t events); uint32_t handle_epoll_events(uint32_t events); + + static maxbase::Duration s_watchdog_interval; /*< Duration between notifications, if any. */ + static maxbase::TimePoint s_watchdog_next_check;/*< Next time to notify systemd. */ + std::atomic m_alive; /*< Set to true in epoll_tick(), false on notification. 
*/ }; // Data local to a routing worker diff --git a/maxutils/maxbase/src/CMakeLists.txt b/maxutils/maxbase/src/CMakeLists.txt index 0e958370e..60ca60b8b 100644 --- a/maxutils/maxbase/src/CMakeLists.txt +++ b/maxutils/maxbase/src/CMakeLists.txt @@ -15,5 +15,9 @@ add_library(maxbase STATIC average.cc ) +target_link_libraries(maxbase + systemd +) + set_target_properties(maxbase PROPERTIES VERSION "1.0.0" LINK_FLAGS -Wl,-z,defs) add_subdirectory(test) diff --git a/server/core/gateway.cc b/server/core/gateway.cc index e0559f84e..d53c931ba 100644 --- a/server/core/gateway.cc +++ b/server/core/gateway.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -1768,6 +1769,13 @@ int main(int argc, char** argv) cnf->log_target = MXB_LOG_TARGET_STDOUT; } + // Systemd watchdog. Must be called in the initial thread. + uint64_t systemd_interval; // in microseconds + if (sd_watchdog_enabled(false, &systemd_interval) > 0) + { + RoutingWorker::set_watchdog_interval(systemd_interval); + } + if (!daemon_mode) { fprintf(stderr, diff --git a/server/core/routingworker.cc b/server/core/routingworker.cc index 88a749fe0..e42c0ec7a 100644 --- a/server/core/routingworker.cc +++ b/server/core/routingworker.cc @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -163,8 +164,14 @@ void modules_thread_finish() namespace maxscale { +// static +maxbase::Duration RoutingWorker::s_watchdog_interval {0}; +// static +maxbase::TimePoint RoutingWorker::s_watchdog_next_check; + RoutingWorker::RoutingWorker() : m_id(next_worker_id()) + , m_alive(true) { MXB_POLL_DATA::handler = &RoutingWorker::epoll_instance_handler; MXB_POLL_DATA::owner = this; @@ -266,6 +273,12 @@ bool RoutingWorker::init() // bofore the workes have been started) will be handled by the worker // that will be running in the main thread. this_thread.current_worker_id = 0; + + if (s_watchdog_interval.count() != 0) + { + MXS_NOTICE("The systemd watchdog is Enabled. 
Internal timeout = %s\n", + to_string(s_watchdog_interval).c_str()); + } } return this_unit.initialized; @@ -535,6 +548,8 @@ void RoutingWorker::epoll_tick() m_state = ZPROCESSING; delete_zombies(); + + check_systemd_watchdog(); } /** @@ -964,6 +979,61 @@ RoutingWorker* RoutingWorker::pick_worker() + (mxb::atomic::add(&id_generator, 1, mxb::atomic::RELAXED) % this_unit.nWorkers); return get(id); } + +// static +void maxscale::RoutingWorker::set_watchdog_interval(uint64_t microseconds) +{ + // Do not call anything from here, assume nothing has been initialized (like logging). + + // The internal timeout is 2/3 of the systemd configured interval. + double seconds = 2.0 * microseconds / 3000000; + + s_watchdog_interval = maxbase::Duration(seconds); + s_watchdog_next_check = maxbase::Clock::now(); +} + +// A note about the below code. While the main worker is turning the "m_alive" values to false, +// it is a possibility that another RoutingWorker sees the old value of "s_watchdog_next_check" +// but its new "m_alive==false" value, marks itself alive and promptly hangs. This would cause a +// watchdog kill delay of about "s_watchdog_interval" time. +// Release-acquire would fix that, but is an unnecessary expense. 
+void RoutingWorker::check_systemd_watchdog() +{ + if (s_watchdog_interval.count() == 0) // not turned on + { + return; + } + + maxbase::TimePoint now = maxbase::Clock::now(); + if (now > s_watchdog_next_check) + { + if (m_id == this_unit.id_main_worker) + { + m_alive.store(true, std::memory_order_relaxed); + bool all_alive = std::all_of(this_unit.ppWorkers, this_unit.ppWorkers + this_unit.nWorkers, + [](RoutingWorker* rw) { + return rw->m_alive.load(std::memory_order_relaxed); + }); + if (all_alive) + { + s_watchdog_next_check = now + s_watchdog_interval; + MXS_NOTICE("sd_notify\n"); + sd_notify(false, "WATCHDOG=1"); + std::for_each(this_unit.ppWorkers, this_unit.ppWorkers + this_unit.nWorkers, + [](RoutingWorker* rw) { + rw->m_alive.store(false, std::memory_order_relaxed); + }); + } + } + else + { + if (m_alive.load(std::memory_order_relaxed) == false) + { + m_alive.store(true, std::memory_order_relaxed); + } + } + } +} } size_t mxs_rworker_broadcast_message(uint32_t msg_id, intptr_t arg1, intptr_t arg2) @@ -1183,7 +1253,6 @@ public: // Success if this is called. } }; - } void mxs_rworker_watchdog()