From 361992bb74768667bdd9e78fb1d0f47f87f211a3 Mon Sep 17 00:00:00 2001 From: Osama Sayegh Date: Mon, 27 May 2024 16:26:35 +0300 Subject: [PATCH] FIX: Apply crawler rate limits to cached requests (#27174) This commit moves the logic for crawler rate limits out of the application controller and into the request tracker middleware. The reason for this move is to apply rate limits to all crawler requests instead of just the requests that make it to the application controller. Some requests are served early from the middleware stack without reaching the Rails app for performance reasons (e.g. `AnonymousCache`) which results in crawlers getting 200 responses even though they've reached their limits and should be getting 429 responses. Internal topic: t/128810. --- app/controllers/application_controller.rb | 23 ----------- lib/middleware/request_tracker.rb | 40 ++++++++++++++++++++ spec/lib/middleware/request_tracker_spec.rb | 25 ++++++++++++ spec/requests/application_controller_spec.rb | 25 ++++++++++-- 4 files changed, 87 insertions(+), 26 deletions(-) diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 16ace3d8bb3..7406d3d4c18 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -29,7 +29,6 @@ class ApplicationController < ActionController::Base end end - before_action :rate_limit_crawlers before_action :check_readonly_mode before_action :handle_theme before_action :set_current_user_for_logs @@ -1053,28 +1052,6 @@ class ApplicationController < ActionController::Base Theme.where(id: ids).pluck(:id, :name).to_h.to_json end - def rate_limit_crawlers - return if current_user.present? - return if SiteSetting.slow_down_crawler_user_agents.blank? - - user_agent = request.user_agent&.downcase - return if user_agent.blank? - - SiteSetting - .slow_down_crawler_user_agents - .downcase - .split("|") - .each do |crawler| - if user_agent.include?(crawler) - key = "#{crawler}_crawler_rate_limit" - limiter = - RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key) - limiter.performed! - break - end - end - end - def run_second_factor!(action_class, action_data: nil, target_user: current_user) if current_user && target_user != current_user # Anon can run 2fa against another target, but logged-in users should not. diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb index db910821784..43d411e32c9 100644 --- a/lib/middleware/request_tracker.rb +++ b/lib/middleware/request_tracker.rb @@ -267,6 +267,20 @@ class Middleware::RequestTracker end return 429, headers, [message] end + + if !cookie + if error_details = check_crawler_limits(env) + available_in, error_code = error_details + message = "Too many crawling requests. Error code: #{error_code}." + headers = { + "Content-Type" => "text/plain", + "Retry-After" => available_in.to_s, + "Discourse-Rate-Limit-Error-Code" => error_code, + } + return 429, headers, [message] + end + end + env["discourse.request_tracker"] = self MethodProfiler.start @@ -443,4 +457,30 @@ class Middleware::RequestTracker end end end + + def check_crawler_limits(env) + slow_down_agents = SiteSetting.slow_down_crawler_user_agents + return if slow_down_agents.blank? + + user_agent = env["HTTP_USER_AGENT"]&.downcase + return if user_agent.blank? + + return if !CrawlerDetection.crawler?(user_agent) + + slow_down_agents + .downcase + .split("|") + .each do |crawler| + if user_agent.include?(crawler) + key = "#{crawler}_crawler_rate_limit" + limiter = + RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key) + limiter.performed! + break + end + end + nil + rescue RateLimiter::LimitExceeded => e + [e.available_in, e.error_code] + end end diff --git a/spec/lib/middleware/request_tracker_spec.rb b/spec/lib/middleware/request_tracker_spec.rb index 0004c385645..190f97e9c95 100644 --- a/spec/lib/middleware/request_tracker_spec.rb +++ b/spec/lib/middleware/request_tracker_spec.rb @@ -369,6 +369,31 @@ RSpec.describe Middleware::RequestTracker do end end + describe "crawler rate limits" do + context "when there are multiple matching crawlers" do + before { SiteSetting.slow_down_crawler_user_agents = "badcrawler2|badcrawler22" } + + it "only checks limits for the first match" do + env = env("HTTP_USER_AGENT" => "badcrawler") + + status, _ = middleware.call(env) + expect(status).to eq(200) + end + end + + it "compares user agents in a case-insensitive manner" do + SiteSetting.slow_down_crawler_user_agents = "BaDCRawLer" + env1 = env("HTTP_USER_AGENT" => "bADcrAWLer") + env2 = env("HTTP_USER_AGENT" => "bADcrAWLer") + + status, _ = middleware.call(env1) + expect(status).to eq(200) + + status, _ = middleware.call(env2) + expect(status).to eq(429) + end + end + describe "register_ip_skipper" do before do Middleware::RequestTracker.register_ip_skipper { |ip| ip == "1.1.1.2" } diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb index 56f414ad0cc..621db5936df 100644 --- a/spec/requests/application_controller_spec.rb +++ b/spec/requests/application_controller_spec.rb @@ -1103,13 +1103,17 @@ RSpec.describe ApplicationController do end describe "crawlers in slow_down_crawler_user_agents site setting" do - before { RateLimiter.enable } + before do + Fabricate(:admin) # to prevent redirect to the wizard + RateLimiter.enable + + SiteSetting.slow_down_crawler_rate = 128 + SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler" + end use_redis_snapshotting it "are rate limited" do - SiteSetting.slow_down_crawler_rate = 128 - SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler" now = Time.zone.now freeze_time now @@ -1141,6 +1145,21 @@ RSpec.describe ApplicationController do get "/", headers: { "HTTP_USER_AGENT" => "iam problematiccrawler" } expect(response.status).to eq(200) end + + context "with anonymous caching" do + before do + global_setting :anon_cache_store_threshold, 1 + Middleware::AnonymousCache.enable_anon_cache + end + + it "don't bypass crawler rate limits" do + get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" } + expect(response.status).to eq(200) + + get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" } + expect(response.status).to eq(429) + end + end end describe "#banner_json" do