From 3d7dbdedc0427612b20df068a8c6c315e3f08ae1 Mon Sep 17 00:00:00 2001
From: Robin Ward
Date: Mon, 16 Apr 2018 15:43:20 -0400
Subject: [PATCH] FEATURE: An API to help sites build robots.txt files
 programmatically

This is mainly useful for subfolder sites, which need to expose their
robots.txt contents to a parent site.
---
 app/controllers/robots_txt_controller.rb    | 63 ++++++++++++++++-----
 app/views/robots_txt/index.erb              | 28 +++------
 config/routes.rb                            |  1 +
 spec/requests/robots_txt_controller_spec.rb | 10 ++++
 4 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index b5dfdb3e626..98153556329 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -36,26 +36,59 @@ class RobotsTxtController < ApplicationController
   }
 
   def index
-    if SiteSetting.allow_index_in_robots_txt
-      path = :index
+    if SiteSetting.allow_index_in_robots_txt?
+      @robots_info = fetch_robots_info
+      render :index, content_type: 'text/plain'
+    else
+      render :no_index, content_type: 'text/plain'
+    end
+  end
 
-      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
-        [agent, SiteSetting.slow_down_crawler_rate]
-      }
+  # If you are hosting Discourse in a subfolder, you will need to create your robots.txt
+  # in the root of your web server with the appropriate paths. This method will return
+  # JSON that can be used by a script to create a robots.txt that works well with your
+  # existing site.
+  def builder
+    render json: fetch_robots_info
+  end
 
-      if SiteSetting.whitelisted_crawler_user_agents.present?
-        @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
-        @disallowed_user_agents = ['*']
-      elsif SiteSetting.blacklisted_crawler_user_agents.present?
-        @allowed_user_agents = ['*']
-        @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|')
-      else
-        @allowed_user_agents = ['*']
+protected
+
+  def fetch_robots_info
+    deny_paths = DISALLOWED_PATHS.map { |p| Discourse.base_uri + p }
+    deny_all = [ "#{Discourse.base_uri}/" ]
+
+    result = {
+      header: "# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file",
+      agents: []
+    }
+
+    if SiteSetting.whitelisted_crawler_user_agents.present?
+      SiteSetting.whitelisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_paths }
+      end
+
+      result[:agents] << { name: '*', disallow: deny_all }
+    elsif SiteSetting.blacklisted_crawler_user_agents.present?
+      result[:agents] << { name: '*', disallow: deny_paths }
+      SiteSetting.blacklisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_all }
       end
     else
-      path = :no_index
+      result[:agents] << { name: '*', disallow: deny_paths }
     end
 
-    render path, content_type: 'text/plain'
+    if SiteSetting.slow_down_crawler_user_agents.present?
+      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << {
+          name: agent,
+          delay: SiteSetting.slow_down_crawler_rate,
+          disallow: deny_paths
+        }
+      end
+    end
+
+    result
   end
+
 end
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index 2e9105c902f..71ca94baa7b 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -1,30 +1,18 @@
-# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file
+<%= @robots_info[:header] %>
 <% if Discourse.base_uri.present? %>
 # This robots.txt file is not used. Please append the content below in the robots.txt file located at the root
 <% end %>
 #
-<% @allowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-<% end %>
-<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
+<% @robots_info[:agents].each do |agent| %>
+User-agent: <%= agent[:name] %>
+<%- if agent[:delay] -%>
+Crawl-delay: <%= agent[:delay] %>
+<%- end -%>
+<% agent[:disallow].each do |path| %>
+Disallow: <%= path %>
 <% end %>
-<% if @disallowed_user_agents %>
-  <% @disallowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-Disallow: <%= Discourse.base_uri + "/" %>
-  <% end %>
 <% end %>
 
 <%= server_plugin_outlet "robots_txt_index" %>
-
-<% @crawler_delayed_agents.each do |agent, delay| %>
-User-agent: <%= agent %>
-Crawl-delay: <%= delay %>
-<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
-<% end %>
-
-<% end %>
 
 
diff --git a/config/routes.rb b/config/routes.rb
index cc191d986a8..d64c0f28c4b 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -744,6 +744,7 @@ Discourse::Application.routes.draw do
   get "favicon/proxied" => "static#favicon", format: false
 
   get "robots.txt" => "robots_txt#index"
+  get "robots-builder.json" => "robots_txt#builder"
   get "offline.html" => "offline#index"
   get "manifest.json" => "metadata#manifest", as: :manifest
   get "opensearch" => "metadata#opensearch", format: :xml
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index 87232518cb6..c5627c0f19b 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -1,6 +1,16 @@
 require 'rails_helper'
 
 RSpec.describe RobotsTxtController do
+  describe '#builder' do
+    it "returns json information for building a robots.txt" do
+      get "/robots-builder.json"
+      json = ::JSON.parse(response.body)
+      expect(json).to be_present
+      expect(json['header']).to be_present
+      expect(json['agents']).to be_present
+    end
+  end
+
   describe '#index' do
     context 'subfolder' do
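
A brief consumer sketch, not part of the patch itself: the robots-builder.json route added above returns the same data the ERB template renders, so a parent site hosting a subfolder Discourse can rebuild its own robots.txt from it. The script below only illustrates that idea; the host (example.com/forum), the output path, and the script itself are assumptions, not anything shipped by this commit.

#!/usr/bin/env ruby
# Illustrative consumer script (not included in the commit above).
# Fetches the JSON emitted by RobotsTxtController#builder and appends
# equivalent robots.txt directives to a parent site's robots.txt.
require 'net/http'
require 'json'
require 'uri'

# Hypothetical subfolder install; replace with the real base URL.
builder_url = URI('https://example.com/forum/robots-builder.json')
info = JSON.parse(Net::HTTP.get(builder_url))
# Expected shape, per fetch_robots_info:
#   { "header" => "# See http://www.robotstxt.org/robotstxt.html ...",
#     "agents" => [{ "name" => "*", "disallow" => ["/forum/admin", ...], "delay" => 10 }] }

lines = [info['header'], '']
info['agents'].each do |agent|
  lines << "User-agent: #{agent['name']}"
  lines << "Crawl-delay: #{agent['delay']}" if agent['delay']
  agent['disallow'].each { |path| lines << "Disallow: #{path}" }
  lines << ''
end

# Output path on the parent web server is assumed.
File.open('/var/www/robots.txt', 'a') { |f| f.puts(lines.join("\n")) }

A script like this could run from a deploy hook or a cron job so the parent robots.txt stays in sync whenever the crawler-related site settings change.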