diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 16ace3d8bb3..7406d3d4c18 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -29,7 +29,6 @@ class ApplicationController < ActionController::Base end end - before_action :rate_limit_crawlers before_action :check_readonly_mode before_action :handle_theme before_action :set_current_user_for_logs @@ -1053,28 +1052,6 @@ class ApplicationController < ActionController::Base Theme.where(id: ids).pluck(:id, :name).to_h.to_json end - def rate_limit_crawlers - return if current_user.present? - return if SiteSetting.slow_down_crawler_user_agents.blank? - - user_agent = request.user_agent&.downcase - return if user_agent.blank? - - SiteSetting - .slow_down_crawler_user_agents - .downcase - .split("|") - .each do |crawler| - if user_agent.include?(crawler) - key = "#{crawler}_crawler_rate_limit" - limiter = - RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key) - limiter.performed! - break - end - end - end - def run_second_factor!(action_class, action_data: nil, target_user: current_user) if current_user && target_user != current_user # Anon can run 2fa against another target, but logged-in users should not. diff --git a/lib/middleware/request_tracker.rb b/lib/middleware/request_tracker.rb index db910821784..43d411e32c9 100644 --- a/lib/middleware/request_tracker.rb +++ b/lib/middleware/request_tracker.rb @@ -267,6 +267,20 @@ class Middleware::RequestTracker end return 429, headers, [message] end + + if !cookie + if error_details = check_crawler_limits(env) + available_in, error_code = error_details + message = "Too many crawling requests. Error code: #{error_code}." + headers = { + "Content-Type" => "text/plain", + "Retry-After" => available_in.to_s, + "Discourse-Rate-Limit-Error-Code" => error_code, + } + return 429, headers, [message] + end + end + env["discourse.request_tracker"] = self MethodProfiler.start @@ -443,4 +457,30 @@ class Middleware::RequestTracker end end end + + def check_crawler_limits(env) + slow_down_agents = SiteSetting.slow_down_crawler_user_agents + return if slow_down_agents.blank? + + user_agent = env["HTTP_USER_AGENT"]&.downcase + return if user_agent.blank? + + return if !CrawlerDetection.crawler?(user_agent) + + slow_down_agents + .downcase + .split("|") + .each do |crawler| + if user_agent.include?(crawler) + key = "#{crawler}_crawler_rate_limit" + limiter = + RateLimiter.new(nil, key, 1, SiteSetting.slow_down_crawler_rate, error_code: key) + limiter.performed! + break + end + end + nil + rescue RateLimiter::LimitExceeded => e + [e.available_in, e.error_code] + end end diff --git a/spec/lib/middleware/request_tracker_spec.rb b/spec/lib/middleware/request_tracker_spec.rb index 0004c385645..190f97e9c95 100644 --- a/spec/lib/middleware/request_tracker_spec.rb +++ b/spec/lib/middleware/request_tracker_spec.rb @@ -369,6 +369,31 @@ RSpec.describe Middleware::RequestTracker do end end + describe "crawler rate limits" do + context "when there are multiple matching crawlers" do + before { SiteSetting.slow_down_crawler_user_agents = "badcrawler2|badcrawler22" } + + it "only checks limits for the first match" do + env = env("HTTP_USER_AGENT" => "badcrawler") + + status, _ = middleware.call(env) + expect(status).to eq(200) + end + end + + it "compares user agents in a case-insensitive manner" do + SiteSetting.slow_down_crawler_user_agents = "BaDCRawLer" + env1 = env("HTTP_USER_AGENT" => "bADcrAWLer") + env2 = env("HTTP_USER_AGENT" => "bADcrAWLer") + + status, _ = middleware.call(env1) + expect(status).to eq(200) + + status, _ = middleware.call(env2) + expect(status).to eq(429) + end + end + describe "register_ip_skipper" do before do Middleware::RequestTracker.register_ip_skipper { |ip| ip == "1.1.1.2" } diff --git a/spec/requests/application_controller_spec.rb b/spec/requests/application_controller_spec.rb index 56f414ad0cc..621db5936df 100644 --- a/spec/requests/application_controller_spec.rb +++ b/spec/requests/application_controller_spec.rb @@ -1103,13 +1103,17 @@ RSpec.describe ApplicationController do end describe "crawlers in slow_down_crawler_user_agents site setting" do - before { RateLimiter.enable } + before do + Fabricate(:admin) # to prevent redirect to the wizard + RateLimiter.enable + + SiteSetting.slow_down_crawler_rate = 128 + SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler" + end use_redis_snapshotting it "are rate limited" do - SiteSetting.slow_down_crawler_rate = 128 - SiteSetting.slow_down_crawler_user_agents = "badcrawler|problematiccrawler" now = Time.zone.now freeze_time now @@ -1141,6 +1145,21 @@ RSpec.describe ApplicationController do get "/", headers: { "HTTP_USER_AGENT" => "iam problematiccrawler" } expect(response.status).to eq(200) end + + context "with anonymous caching" do + before do + global_setting :anon_cache_store_threshold, 1 + Middleware::AnonymousCache.enable_anon_cache + end + + it "don't bypass crawler rate limits" do + get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" } + expect(response.status).to eq(200) + + get "/", headers: { "HTTP_USER_AGENT" => "iam badcrawler" } + expect(response.status).to eq(429) + end + end end describe "#banner_json" do