From 6cafe59c76fd2e1151f2b8483402a6984ccd901b Mon Sep 17 00:00:00 2001
From: Alan Guo Xiang Tan
Date: Mon, 27 May 2024 12:20:38 +0800
Subject: [PATCH] DEV: Add `DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT` env (#27199)

This commit adds a `DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT`
environment variable that will allow us to dump all backtraces for all
threads of a Unicorn worker 2 seconds before it times out. In development,
backtraces are dumped to `STDOUT` and in production we will dump it to
`unicorn.stdout.log`.

We want to dump all the backtraces to make it easier to identify the cause
of a Unicorn worker timing out.
---
 config/unicorn.conf.rb                        |  5 ++
 .../unicorn_http_server_patch.rb              | 50 +++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 lib/freedom_patches/unicorn_http_server_patch.rb

diff --git a/config/unicorn.conf.rb b/config/unicorn.conf.rb
index 9fd348b0745..2c1e43830aa 100644
--- a/config/unicorn.conf.rb
+++ b/config/unicorn.conf.rb
@@ -268,4 +268,9 @@ end
 after_fork do |server, worker|
   DiscourseEvent.trigger(:web_fork_started)
   Discourse.after_fork
+
+  Signal.trap("USR2") { puts <<~MSG }
+    [#{Time.now.utc.strftime("%Y-%m-%dT%H:%M:%S.%6N")} ##{Process.pid}] Received USR2 signal, dumping backtrace for all threads
+    #{Thread.list.map { |t| "#{t.backtrace&.join("\n")}" }.join("\n\n")}
+  MSG
 end
diff --git a/lib/freedom_patches/unicorn_http_server_patch.rb b/lib/freedom_patches/unicorn_http_server_patch.rb
new file mode 100644
index 00000000000..6edb04154c5
--- /dev/null
+++ b/lib/freedom_patches/unicorn_http_server_patch.rb
@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+
+# Opt-in (DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT) override of
+# Unicorn::HttpServer#murder_lazy_workers that sends USR2 to a worker shortly
+# before the worker is killed for exceeding the timeout.
+if ENV["DISCOURSE_DUMP_BACKTRACES_ON_UNICORN_WORKER_TIMEOUT"] && defined?(Unicorn::HttpServer)
+  module UnicornHTTPServerPatch
+    # Original source: https://github.com/defunkt/unicorn/blob/6c9c442fb6aa12fd871237bc2bb5aec56c5b3538/lib/unicorn/http_server.rb#L477-L496
+    #
+    # Sends KILL to every worker whose last tick is more than `@timeout`
+    # seconds old. Returns how many seconds may pass before the next check
+    # is needed (never less than 1).
+    def murder_lazy_workers
+      next_sleep = @timeout - 1
+      now = time_now.to_i
+      @workers.dup.each_pair do |wpid, worker|
+        tick = worker.tick
+        0 == tick and next # skip workers that haven't processed any clients
+        diff = now - tick
+        tmp = @timeout - diff
+
+        # START MONKEY PATCH
+        # Request a dump only while the worker still has time left (a USR2
+        # sent right before the KILL below is unlikely to be handled) and only
+        # once per tick, so repeated checks inside the final two seconds do
+        # not log and signal again.
+        if tmp >= 0 && tmp < 2 && worker.instance_variable_get(:@backtrace_dump_tick) != tick
+          worker.instance_variable_set(:@backtrace_dump_tick, tick)
+          logger.error "worker=#{worker.nr} PID:#{wpid} running too long " \
+                         "(#{diff}s), sending USR2 to dump thread backtraces"
+          kill_worker(:USR2, wpid)
+        end
+        # END MONKEY PATCH
+
+        if tmp >= 0
+          next_sleep > tmp and next_sleep = tmp
+          next
+        end
+        next_sleep = 0
+        logger.error "worker=#{worker.nr} PID:#{wpid} timeout " \
+                       "(#{diff}s > #{@timeout}s), killing"
+
+        kill_worker(:KILL, wpid) # take no prisoners for timeout violations
+      end
+      next_sleep <= 0 ? 1 : next_sleep
+    end
+  end
+
+  Unicorn::HttpServer.prepend(UnicornHTTPServerPatch)
+end