prefix the robots.txt rules with the directory when using subfolder

Régis Hanol
2018-04-11 22:05:02 +02:00
parent 3c8b43bb01
commit df7970a6f6
4 changed files with 49 additions and 35 deletions
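The net effect is that every rule emitted into robots.txt is prefixed with Discourse.base_uri, so installs served from a subfolder disallow the correct paths. A minimal sketch of the prefixing, assuming a site mounted at /forum (the same value the new spec stubs) and a few sample paths taken from the DISALLOWED_PATHS list below:

base_uri = '/forum'  # assumed subfolder, as stubbed in the new spec
%w{/admin /u/ /t/*/*.rss}.map { |path| base_uri + path }
# => ["/forum/admin", "/forum/u/", "/forum/t/*/*.rss"]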


@@ -2,14 +2,46 @@ class RobotsTxtController < ApplicationController
   layout false
   skip_before_action :preload_json, :check_xhr, :redirect_to_login_if_required
+  # NOTE: order is important!
+  DISALLOWED_PATHS ||= %w{
+    /auth/cas
+    /auth/facebook/callback
+    /auth/twitter/callback
+    /auth/google/callback
+    /auth/yahoo/callback
+    /auth/github/callback
+    /auth/cas/callback
+    /assets/browser-update*.js
+    /users/
+    /u/
+    /badges/
+    /search
+    /search/
+    /tags
+    /tags/
+    /email/
+    /session
+    /session/
+    /admin
+    /admin/
+    /user-api-key
+    /user-api-key/
+    /*?api_key*
+    /*?*api_key*
+    /groups
+    /groups/
+    /t/*/*.rss
+    /tags/*.rss
+    /c/*.rss
+  }
   def index
     if SiteSetting.allow_index_in_robots_txt
       path = :index
-      @crawler_delayed_agents = []
-      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
-        @crawler_delayed_agents << [agent, SiteSetting.slow_down_crawler_rate]
-      end
+      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
+        [agent, SiteSetting.slow_down_crawler_rate]
+      }
       if SiteSetting.whitelisted_crawler_user_agents.present?
         @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
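As an aside on the crawler-delay code touched above: the explicit accumulator plus each/<< becomes a single map over the split setting. A minimal sketch with assumed values (the real code reads the slow_down_crawler_user_agents and slow_down_crawler_rate site settings):

'Googlebot|bingbot'.split('|').map { |agent| [agent, 60] }
# => [["Googlebot", 60], ["bingbot", 60]]  # agents and rate are assumptions, for illustration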


@@ -3,40 +3,14 @@
 <% @allowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
 <% end %>
-Disallow: /auth/cas
-Disallow: /auth/facebook/callback
-Disallow: /auth/twitter/callback
-Disallow: /auth/google/callback
-Disallow: /auth/yahoo/callback
-Disallow: /auth/github/callback
-Disallow: /auth/cas/callback
-Disallow: /assets/browser-update*.js
-Disallow: /users/
-Disallow: /u/
-Disallow: /badges/
-Disallow: /search
-Disallow: /search/
-Disallow: /tags
-Disallow: /tags/
-Disallow: /email/
-Disallow: /session
-Disallow: /session/
-Disallow: /admin
-Disallow: /admin/
-Disallow: /user-api-key
-Disallow: /user-api-key/
-Disallow: /*?api_key*
-Disallow: /*?*api_key*
-Disallow: /groups
-Disallow: /groups/
-Disallow: /t/*/*.rss
-Disallow: /tags/*.rss
-Disallow: /c/*.rss
+<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
+Disallow: <%= Discourse.base_uri + path %>
+<% end %>
 <% if @disallowed_user_agents %>
 <% @disallowed_user_agents.each do |user_agent| %>
 User-agent: <%= user_agent %>
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
 <% end %>
 <% end %>
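With the loop above, every disallow rule in the rendered robots.txt now carries the base URI. A sketch of a few rendered lines, assuming Discourse.base_uri returns "/forum" (paths taken from DISALLOWED_PATHS):

Disallow: /forum/auth/cas
Disallow: /forum/users/
Disallow: /forum/admin
Disallow: /forum/t/*/*.rss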


@@ -1,4 +1,4 @@
 # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file
 #
 User-agent: *
-Disallow: /
+Disallow: <%= Discourse.base_uri + "/" %>
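When indexing is disallowed entirely, the template above now renders a prefixed blanket rule as well; assuming the same /forum base URI, the output would read:

User-agent: *
Disallow: /forum/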


@@ -3,6 +3,14 @@ require 'rails_helper'
 RSpec.describe RobotsTxtController do
   describe '#index' do
+    context 'subfolder' do
+      it 'prefixes the rules with the directory' do
+        Discourse.stubs(:base_uri).returns('/forum')
+        get '/robots.txt'
+        expect(response.body).to include("\nDisallow: /forum/admin")
+      end
+    end
     context 'crawl delay' do
       it 'allows you to set crawl delay on particular bots' do
         SiteSetting.allow_index_in_robots_txt = true