mirror of
https://github.com/discourse/discourse.git
synced 2025-06-03 19:39:30 +08:00
FIX: move crawler blocking into anon cache
This refinement of the previous fix moves the crawler blocking into the anonymous cache. This ensures we never poison the cache incorrectly when blocking crawlers.
This commit is contained in:
@ -33,71 +33,4 @@ RSpec.describe ApplicationController do
|
||||
end
|
||||
end
|
||||
|
||||
context "crawler blocking" do
  # A browser-like user agent that must never be treated as a crawler.
  let(:non_crawler) do
    {
      "HTTP_USER_AGENT" =>
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    }
  end

  it "applies whitelisted_crawler_user_agents correctly" do
    SiteSetting.whitelisted_crawler_user_agents = "Googlebot"

    # A crawler matching the whitelist is allowed through.
    get "/srv/status", headers: { "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" }
    expect(response.status).to eq(200)

    # Any crawler NOT on the whitelist is rejected.
    get "/srv/status", headers: { "HTTP_USER_AGENT" => "Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)" }
    expect(response.status).to eq(403)

    # Ordinary browsers are unaffected by the whitelist.
    get "/srv/status", headers: non_crawler
    expect(response.status).to eq(200)
  end

  it "applies blacklisted_crawler_user_agents correctly" do
    SiteSetting.blacklisted_crawler_user_agents = "Googlebot"

    # Ordinary browsers are unaffected by the blacklist.
    get "/srv/status", headers: non_crawler
    expect(response.status).to eq(200)

    # A crawler matching the blacklist is rejected.
    get "/srv/status", headers: { "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" }
    expect(response.status).to eq(403)

    # Crawlers not on the blacklist are still allowed through.
    get "/srv/status", headers: { "HTTP_USER_AGENT" => "Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)" }
    expect(response.status).to eq(200)
  end

  it "blocked crawlers shouldn't log page views" do
    ApplicationRequest.clear_cache!
    SiteSetting.blacklisted_crawler_user_agents = "Googlebot"

    # A blocked request must not increment the page-view counters,
    # even after the request cache is flushed to the database.
    expect {
      get "/srv/status", headers: { "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" }
      ApplicationRequest.write_cache!
    }.to_not change { ApplicationRequest.count }
  end

  it "blocks json requests" do
    SiteSetting.blacklisted_crawler_user_agents = "Googlebot"

    # Blocking applies to JSON endpoints as well, not just HTML pages.
    get "/srv/status.json", headers: { "HTTP_USER_AGENT" => "Googlebot/2.1 (+http://www.google.com/bot.html)" }
    expect(response.status).to eq(403)
  end
end
|
||||
|
||||
end
|
||||
|
Reference in New Issue
Block a user