Mirror of https://github.com/discourse/discourse.git, synced 2025-06-06 12:26:04 +08:00
FEATURE: explicitly ban outlier traffic sources in robots.txt (#11553)
Googlebot handles noindex headers very elegantly. Google advises leaving as many routes as possible open and using headers for high-fidelity rules about what gets indexed. Discourse adds special `X-Robots-Tag` noindex headers to the users, badges, groups, search and tag routes. Following up on b52143feff8c32f2, Googlebot now gets special handling: it keeps the minimal disallow list, while the rest of the crawlers get a far more aggressive disallow list to protect against excessive crawling.
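To illustrate the effect (a sketch, assuming a default site with no base path and with neither allowed_crawler_user_agents nor blocked_crawler_user_agents configured), the generated robots.txt now contains a catch-all block that also disallows the header-protected routes, followed by a Googlebot block limited to the core list:

    User-agent: *
    Disallow: /admin/
    Disallow: /auth/
    Disallow: /assets/browser-update*.js
    Disallow: /email/
    Disallow: /session
    Disallow: /user-api-key
    Disallow: /*?api_key*
    Disallow: /*?*api_key*
    Disallow: /badges
    Disallow: /u
    Disallow: /my
    Disallow: /search
    Disallow: /tag
    Disallow: /g
    Disallow: /t/*/*.rss
    Disallow: /c/*.rss

    User-agent: Googlebot
    Disallow: /admin/
    Disallow: /auth/
    Disallow: /assets/browser-update*.js
    Disallow: /email/
    Disallow: /session
    Disallow: /user-api-key
    Disallow: /*?api_key*
    Disallow: /*?*api_key*

(Surrounding comments and crawl-delay rules in the real file are omitted; this only shows the per-agent Disallow rules the change produces.)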
@@ -8,17 +8,27 @@ class RobotsTxtController < ApplicationController
 
   # NOTE: order is important!
   DISALLOWED_PATHS ||= %w{
+    /admin/
     /auth/
     /assets/browser-update*.js
     /email/
     /session
-    /session/
     /user-api-key
-    /user-api-key/
     /*?api_key*
     /*?*api_key*
   }
 
+  DISALLOWED_WITH_HEADER_PATHS ||= %w{
+    /badges
+    /u
+    /my
+    /search
+    /tag
+    /g
+    /t/*/*.rss
+    /c/*.rss
+  }
+
   def index
     if (overridden = SiteSetting.overridden_robots_txt.dup).present?
       overridden.prepend(OVERRIDDEN_HEADER) if guardian.is_admin? && !is_api?
@@ -45,7 +55,8 @@ class RobotsTxtController < ApplicationController
   end
 
   def self.fetch_default_robots_info
-    deny_paths = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
+    deny_paths_googlebot = DISALLOWED_PATHS.map { |p| Discourse.base_path + p }
+    deny_paths = deny_paths_googlebot + DISALLOWED_WITH_HEADER_PATHS.map { |p| Discourse.base_path + p }
     deny_all = [ "#{Discourse.base_path}/" ]
 
     result = {
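The `.map` calls above prefix every path with `Discourse.base_path`, which only matters for subfolder installs. A quick console sketch, using a hypothetical "/forum" base path:

    # Rails console sketch; "/forum" stands in for Discourse.base_path on a subfolder install
    RobotsTxtController::DISALLOWED_PATHS.map { |p| "/forum" + p }
    # => ["/forum/admin/", "/forum/auth/", "/forum/assets/browser-update*.js", ...]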
@@ -55,17 +66,22 @@ class RobotsTxtController < ApplicationController
 
     if SiteSetting.allowed_crawler_user_agents.present?
       SiteSetting.allowed_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << { name: agent, disallow: deny_paths }
+        paths = agent == "Googlebot" ? deny_paths_googlebot : deny_paths
+        result[:agents] << { name: agent, disallow: paths }
       end
 
       result[:agents] << { name: '*', disallow: deny_all }
-    elsif SiteSetting.blocked_crawler_user_agents.present?
-      result[:agents] << { name: '*', disallow: deny_paths }
-      SiteSetting.blocked_crawler_user_agents.split('|').each do |agent|
-        result[:agents] << { name: agent, disallow: deny_all }
-      end
     else
+
+      if SiteSetting.blocked_crawler_user_agents.present?
+        SiteSetting.blocked_crawler_user_agents.split('|').each do |agent|
+          result[:agents] << { name: agent, disallow: deny_all }
+        end
+      end
+
       result[:agents] << { name: '*', disallow: deny_paths }
+
+      result[:agents] << { name: 'Googlebot', disallow: deny_paths_googlebot }
     end
 
     if SiteSetting.slow_down_crawler_user_agents.present?
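For a quick sanity check of the branch logic above, the computed rules can be inspected from a Rails console (a sketch, assuming default crawler settings, i.e. neither allowed_crawler_user_agents nor blocked_crawler_user_agents is set, so the else branch runs):

    info = RobotsTxtController.fetch_default_robots_info
    info[:agents].map { |a| a[:name] }
    # expected to include both "*" and "Googlebot"
    googlebot = info[:agents].find { |a| a[:name] == "Googlebot" }
    googlebot[:disallow]
    # expected to contain only the DISALLOWED_PATHS entries, not the DISALLOWED_WITH_HEADER_PATHS ones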
@@ -91,6 +91,8 @@ RSpec.describe RobotsTxtController do
       i = response.body.index('User-agent: *')
       expect(i).to be_present
       expect(response.body[i..-1]).to include("Disallow: /auth/")
+      # we have to insert Googlebot for special handling
+      expect(response.body[i..-1]).to include("User-agent: Googlebot")
     end
 
     it "can allowlist user agents" do