From bfe502012d52b1f61617472bf3f032fe1e2bc6da Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 3 Aug 2022 12:53:26 +1000 Subject: [PATCH] FEATURE: track stats around failing scheduled jobs (#17769) * FEATURE: track stats around failing scheduled jobs Discourse.job_exception_stats can now be used to gather stats around how many regular scheduled jobs failed in the current process. This will be consumed by the Prometheus plugin and potentially other monitoring plugins. --- lib/discourse.rb | 15 +++++++++++++++ spec/lib/discourse_spec.rb | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/lib/discourse.rb b/lib/discourse.rb index 4616b887d2d..6891f83789a 100644 --- a/lib/discourse.rb +++ b/lib/discourse.rb @@ -153,6 +153,16 @@ module Discourse end end + def self.job_exception_stats + @job_exception_stats + end + + def self.reset_job_exception_stats! + @job_exception_stats = Hash.new(0) + end + + reset_job_exception_stats! + # Log an exception. # # If your code is in a scheduled job, it is recommended to use the @@ -165,6 +175,11 @@ module Discourse context ||= {} parent_logger ||= Sidekiq + job = context.dig(:job, "class") + if job + job_exception_stats[job] += 1 + end + cm = RailsMultisite::ConnectionManagement parent_logger.handle_exception(ex, { current_db: cm.current_db, diff --git a/spec/lib/discourse_spec.rb b/spec/lib/discourse_spec.rb index 3d1401d0443..783958939c0 100644 --- a/spec/lib/discourse_spec.rb +++ b/spec/lib/discourse_spec.rb @@ -340,6 +340,44 @@ RSpec.describe Discourse do Sidekiq.error_handlers.delete(logger) end + describe "#job_exception_stats" do + + before do + Discourse.reset_job_exception_stats! + end + + after do + Discourse.reset_job_exception_stats! + end + + it "should collect job exception stats" do + + # see MiniScheduler Manager which reports it like this + # https://github.com/discourse/mini_scheduler/blob/2b2c1c56b6e76f51108c2a305775469e24cf2b65/lib/mini_scheduler/manager.rb#L95 + exception_context = { + message: "Running a scheduled job", + job: { "class" => Jobs::ReindexSearch } + } + + # re-raised unconditionally in test env + 2.times do + expect { Discourse.handle_job_exception(StandardError.new, exception_context) }.to raise_error(StandardError) + end + + exception_context = { + message: "Running a scheduled job", + job: { "class" => Jobs::PollMailbox } + } + + expect { Discourse.handle_job_exception(StandardError.new, exception_context) }.to raise_error(StandardError) + + expect(Discourse.job_exception_stats).to eq({ + Jobs::PollMailbox => 1, + Jobs::ReindexSearch => 2, + }) + end + end + it "should not fail when called" do exception = StandardError.new