FIX: move crawler blocking into anon cache

This refinement of the previous fix moves the crawler blocking into the
anonymous cache.

This ensures we never poison the cache incorrectly when blocking crawlers.
This commit is contained in:
Sam
2018-07-04 11:14:43 +10:00
parent 7f98ed69cd
commit e72fd7ae4e
4 changed files with 99 additions and 81 deletions

View File

@ -152,4 +152,91 @@ describe Middleware::AnonymousCache::Helper do
end
end
context "crawler blocking" do
  # A plain browser user agent that must never be classified as a crawler.
  let :non_crawler do
    {
      "HTTP_USER_AGENT" =>
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    }
  end

  # Issues a GET for +path+ through a fresh AnonymousCache middleware whose
  # downstream app always answers 200, so any non-200 in @status must have
  # been produced by the middleware itself (i.e. the crawler blocking).
  # Stores the Rack env in @env and the response status in @status.
  #
  # Uses a keyword argument instead of an options hash; existing call sites
  # (`get path, headers: {...}`) are unchanged, and headers may now be omitted.
  def get(path, headers: {})
    middleware = Middleware::AnonymousCache.new(lambda { |_| [200, {}, []] })
    @env = env({
      "REQUEST_URI" => path,
      "PATH_INFO" => path,
      "REQUEST_PATH" => path
    }.merge(headers))
    @status = middleware.call(@env).first
  end

  it "applies whitelisted_crawler_user_agents correctly" do
    SiteSetting.whitelisted_crawler_user_agents = 'Googlebot'

    # Whitelisted crawler passes through.
    get '/srv/status', headers: {
      'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    expect(@status).to eq(200)

    # Any other crawler is blocked once a whitelist exists.
    get '/srv/status', headers: {
      'HTTP_USER_AGENT' => 'Anotherbot/2.1 (+http://www.notgoogle.com/bot.html)'
    }
    expect(@status).to eq(403)

    # Regular browsers are never blocked.
    get '/srv/status', headers: non_crawler
    expect(@status).to eq(200)
  end

  it "applies blacklisted_crawler_user_agents correctly" do
    SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

    # Regular browsers are unaffected by the blacklist.
    get '/srv/status', headers: non_crawler
    expect(@status).to eq(200)

    # The blacklisted crawler is blocked.
    get '/srv/status', headers: {
      'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    expect(@status).to eq(403)

    # Crawlers not on the blacklist still pass.
    get '/srv/status', headers: {
      'HTTP_USER_AGENT' => 'Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)'
    }
    expect(@status).to eq(200)
  end

  it "should never block robots.txt" do
    SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

    # robots.txt must stay reachable even for blacklisted crawlers.
    get '/robots.txt', headers: {
      'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    expect(@status).to eq(200)
  end

  it "blocked crawlers shouldn't log page views" do
    SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

    get '/srv/status', headers: {
      'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    # The middleware flags the request so the request tracker skips it.
    expect(@env["discourse.request_tracker.skip"]).to eq(true)
  end

  it "blocks json requests" do
    SiteSetting.blacklisted_crawler_user_agents = 'Googlebot'

    # Blocking applies to .json endpoints as well, not only HTML pages.
    get '/srv/status.json', headers: {
      'HTTP_USER_AGENT' => 'Googlebot/2.1 (+http://www.google.com/bot.html)'
    }
    expect(@status).to eq(403)
  end
end
end