MXS-2057 systemd watchdog
The systemd watchdog notification is sent at a little more than 2/3 of the systemd-configured time. In the service config (maxscale.service), add e.g. WatchdogSec=30s to set and enable the watchdog. For building: install libsystemd-dev. The next commit will modify the cmake configuration and the code to conditionally compile the new code based on the existence of libsystemd-dev.
This commit is contained in:
@ -18,10 +18,12 @@
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <type_traits>
|
#include <type_traits>
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
#include <maxbase/atomic.hh>
|
#include <maxbase/atomic.hh>
|
||||||
#include <maxbase/semaphore.hh>
|
#include <maxbase/semaphore.hh>
|
||||||
#include <maxbase/worker.hh>
|
#include <maxbase/worker.hh>
|
||||||
|
#include <maxbase/stopwatch.hh>
|
||||||
#include <maxscale/poll.h>
|
#include <maxscale/poll.h>
|
||||||
#include <maxscale/query_classifier.h>
|
#include <maxscale/query_classifier.h>
|
||||||
#include <maxscale/routingworker.h>
|
#include <maxscale/routingworker.h>
|
||||||
@ -426,6 +428,10 @@ public:
|
|||||||
*/
|
*/
|
||||||
static std::unique_ptr<json_t> get_qc_stats_as_json(const char* zHost, int id);
|
static std::unique_ptr<json_t> get_qc_stats_as_json(const char* zHost, int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* To be called from the initial (parent) thread if the systemd watchdog is on.
|
||||||
|
*/
|
||||||
|
static void set_watchdog_interval(uint64_t microseconds);
|
||||||
private:
|
private:
|
||||||
const int m_id; /*< The id of the worker. */
|
const int m_id; /*< The id of the worker. */
|
||||||
SessionsById m_sessions; /*< A mapping of session_id->MXS_SESSION. The map
|
SessionsById m_sessions; /*< A mapping of session_id->MXS_SESSION. The map
|
||||||
@ -447,9 +453,14 @@ private:
|
|||||||
void epoll_tick(); // override
|
void epoll_tick(); // override
|
||||||
|
|
||||||
void delete_zombies();
|
void delete_zombies();
|
||||||
|
void check_systemd_watchdog();
|
||||||
|
|
||||||
static uint32_t epoll_instance_handler(MXB_POLL_DATA* data, MXB_WORKER* worker, uint32_t events);
|
static uint32_t epoll_instance_handler(MXB_POLL_DATA* data, MXB_WORKER* worker, uint32_t events);
|
||||||
uint32_t handle_epoll_events(uint32_t events);
|
uint32_t handle_epoll_events(uint32_t events);
|
||||||
|
|
||||||
|
static maxbase::Duration s_watchdog_interval; /*< Duration between notifications, if any. */
|
||||||
|
static maxbase::TimePoint s_watchdog_next_check;/*< Next time to notify systemd. */
|
||||||
|
std::atomic<bool> m_alive; /*< Set to true in epoll_tick(), false on notification. */
|
||||||
};
|
};
|
||||||
|
|
||||||
// Data local to a routing worker
|
// Data local to a routing worker
|
||||||
|
|||||||
@ -15,5 +15,9 @@ add_library(maxbase STATIC
|
|||||||
average.cc
|
average.cc
|
||||||
)
|
)
|
||||||
|
|
||||||
|
target_link_libraries(maxbase
|
||||||
|
systemd
|
||||||
|
)
|
||||||
|
|
||||||
set_target_properties(maxbase PROPERTIES VERSION "1.0.0" LINK_FLAGS -Wl,-z,defs)
|
set_target_properties(maxbase PROPERTIES VERSION "1.0.0" LINK_FLAGS -Wl,-z,defs)
|
||||||
add_subdirectory(test)
|
add_subdirectory(test)
|
||||||
|
|||||||
@ -27,6 +27,7 @@
|
|||||||
#include <time.h>
|
#include <time.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <getopt.h>
|
#include <getopt.h>
|
||||||
|
#include <systemd/sd-daemon.h>
|
||||||
|
|
||||||
#include <set>
|
#include <set>
|
||||||
#include <map>
|
#include <map>
|
||||||
@ -1768,6 +1769,13 @@ int main(int argc, char** argv)
|
|||||||
cnf->log_target = MXB_LOG_TARGET_STDOUT;
|
cnf->log_target = MXB_LOG_TARGET_STDOUT;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Systemd watchdog. Must be called in the initial thread.
|
||||||
|
uint64_t systemd_interval; // in microseconds
|
||||||
|
if (sd_watchdog_enabled(false, &systemd_interval) > 0)
|
||||||
|
{
|
||||||
|
RoutingWorker::set_watchdog_interval(systemd_interval);
|
||||||
|
}
|
||||||
|
|
||||||
if (!daemon_mode)
|
if (!daemon_mode)
|
||||||
{
|
{
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
|
|||||||
@ -18,6 +18,7 @@
|
|||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <systemd/sd-daemon.h>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
|
|
||||||
@ -163,8 +164,14 @@ void modules_thread_finish()
|
|||||||
namespace maxscale
|
namespace maxscale
|
||||||
{
|
{
|
||||||
|
|
||||||
|
// static
|
||||||
|
maxbase::Duration RoutingWorker::s_watchdog_interval {0};
|
||||||
|
// static
|
||||||
|
maxbase::TimePoint RoutingWorker::s_watchdog_next_check;
|
||||||
|
|
||||||
RoutingWorker::RoutingWorker()
|
RoutingWorker::RoutingWorker()
|
||||||
: m_id(next_worker_id())
|
: m_id(next_worker_id())
|
||||||
|
, m_alive(true)
|
||||||
{
|
{
|
||||||
MXB_POLL_DATA::handler = &RoutingWorker::epoll_instance_handler;
|
MXB_POLL_DATA::handler = &RoutingWorker::epoll_instance_handler;
|
||||||
MXB_POLL_DATA::owner = this;
|
MXB_POLL_DATA::owner = this;
|
||||||
@ -266,6 +273,12 @@ bool RoutingWorker::init()
|
|||||||
// before the workers have been started) will be handled by the worker
|
// before the workers have been started) will be handled by the worker
|
||||||
// that will be running in the main thread.
|
// that will be running in the main thread.
|
||||||
this_thread.current_worker_id = 0;
|
this_thread.current_worker_id = 0;
|
||||||
|
|
||||||
|
if (s_watchdog_interval.count() != 0)
|
||||||
|
{
|
||||||
|
MXS_NOTICE("The systemd watchdog is Enabled. Internal timeout = %s\n",
|
||||||
|
to_string(s_watchdog_interval).c_str());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return this_unit.initialized;
|
return this_unit.initialized;
|
||||||
@ -535,6 +548,8 @@ void RoutingWorker::epoll_tick()
|
|||||||
m_state = ZPROCESSING;
|
m_state = ZPROCESSING;
|
||||||
|
|
||||||
delete_zombies();
|
delete_zombies();
|
||||||
|
|
||||||
|
check_systemd_watchdog();
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -964,6 +979,61 @@ RoutingWorker* RoutingWorker::pick_worker()
|
|||||||
+ (mxb::atomic::add(&id_generator, 1, mxb::atomic::RELAXED) % this_unit.nWorkers);
|
+ (mxb::atomic::add(&id_generator, 1, mxb::atomic::RELAXED) % this_unit.nWorkers);
|
||||||
return get(id);
|
return get(id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// static
|
||||||
|
void maxscale::RoutingWorker::set_watchdog_interval(uint64_t microseconds)
|
||||||
|
{
|
||||||
|
// Do not call anything from here, assume nothing has been initialized (like logging).
|
||||||
|
|
||||||
|
// The internal timeout is 2/3 of the systemd configured interval.
|
||||||
|
double seconds = 2.0 * microseconds / 3000000;
|
||||||
|
|
||||||
|
s_watchdog_interval = maxbase::Duration(seconds);
|
||||||
|
s_watchdog_next_check = maxbase::Clock::now();
|
||||||
|
}
|
||||||
|
|
||||||
|
// A note about the below code. While the main worker is turning the "m_alive" values to false,
|
||||||
|
// it is a possibility that another RoutingWorker sees the old value of "s_watchdog_next_check"
|
||||||
|
// but its new "m_alive==false" value, marks itself alive and promptly hangs. This would cause a
|
||||||
|
// watchdog kill delay of about "s_watchdog_interval" time.
|
||||||
|
// Release-acquire would fix that, but is an unnecessary expense.
|
||||||
|
void RoutingWorker::check_systemd_watchdog()
|
||||||
|
{
|
||||||
|
if (s_watchdog_interval.count() == 0) // not turned on
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
maxbase::TimePoint now = maxbase::Clock::now();
|
||||||
|
if (now > s_watchdog_next_check)
|
||||||
|
{
|
||||||
|
if (m_id == this_unit.id_main_worker)
|
||||||
|
{
|
||||||
|
m_alive.store(true, std::memory_order_relaxed);
|
||||||
|
bool all_alive = std::all_of(this_unit.ppWorkers, this_unit.ppWorkers + this_unit.nWorkers,
|
||||||
|
[](RoutingWorker* rw) {
|
||||||
|
return rw->m_alive.load(std::memory_order_relaxed);
|
||||||
|
});
|
||||||
|
if (all_alive)
|
||||||
|
{
|
||||||
|
s_watchdog_next_check = now + s_watchdog_interval;
|
||||||
|
MXS_NOTICE("sd_notify\n");
|
||||||
|
sd_notify(false, "WATCHDOG=1");
|
||||||
|
std::for_each(this_unit.ppWorkers, this_unit.ppWorkers + this_unit.nWorkers,
|
||||||
|
[](RoutingWorker* rw) {
|
||||||
|
rw->m_alive.store(false, std::memory_order_relaxed);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (m_alive.load(std::memory_order_relaxed) == false)
|
||||||
|
{
|
||||||
|
m_alive.store(true, std::memory_order_relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t mxs_rworker_broadcast_message(uint32_t msg_id, intptr_t arg1, intptr_t arg2)
|
size_t mxs_rworker_broadcast_message(uint32_t msg_id, intptr_t arg1, intptr_t arg2)
|
||||||
@ -1183,7 +1253,6 @@ public:
|
|||||||
// Success if this is called.
|
// Success if this is called.
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void mxs_rworker_watchdog()
|
void mxs_rworker_watchdog()
|
||||||
|
|||||||
Reference in New Issue
Block a user