From 3d7dbdedc0427612b20df068a8c6c315e3f08ae1 Mon Sep 17 00:00:00 2001
From: Robin Ward
Date: Mon, 16 Apr 2018 15:43:20 -0400
Subject: [PATCH] FEATURE: An API to help sites build robots.txt files
 programmatically

This is mainly useful for subfolder sites, which need to expose their
robots.txt contents to a parent site.
---
 app/controllers/robots_txt_controller.rb    | 63 ++++++++++++++++-----
 app/views/robots_txt/index.erb              | 28 +++------
 config/routes.rb                            |  1 +
 spec/requests/robots_txt_controller_spec.rb | 10 ++++
 4 files changed, 67 insertions(+), 35 deletions(-)

diff --git a/app/controllers/robots_txt_controller.rb b/app/controllers/robots_txt_controller.rb
index b5dfdb3e626..98153556329 100644
--- a/app/controllers/robots_txt_controller.rb
+++ b/app/controllers/robots_txt_controller.rb
@@ -36,26 +36,59 @@ class RobotsTxtController < ApplicationController
   }
 
   def index
-    if SiteSetting.allow_index_in_robots_txt
-      path = :index
+    if SiteSetting.allow_index_in_robots_txt?
+      @robots_info = fetch_robots_info
+      render :index, content_type: 'text/plain'
+    else
+      render :no_index, content_type: 'text/plain'
+    end
+  end
 
-      @crawler_delayed_agents = SiteSetting.slow_down_crawler_user_agents.split('|').map { |agent|
-        [agent, SiteSetting.slow_down_crawler_rate]
-      }
+  # If you are hosting Discourse in a subfolder, you will need to create your robots.txt
+  # in the root of your web server with the appropriate paths. This method will return
+  # JSON that can be used by a script to create a robots.txt that works well with your
+  # existing site.
+  def builder
+    render json: fetch_robots_info
+  end
 
-      if SiteSetting.whitelisted_crawler_user_agents.present?
-        @allowed_user_agents = SiteSetting.whitelisted_crawler_user_agents.split('|')
-        @disallowed_user_agents = ['*']
-      elsif SiteSetting.blacklisted_crawler_user_agents.present?
-        @allowed_user_agents = ['*']
-        @disallowed_user_agents = SiteSetting.blacklisted_crawler_user_agents.split('|')
-      else
-        @allowed_user_agents = ['*']
+protected
+
+  def fetch_robots_info
+    deny_paths = DISALLOWED_PATHS.map { |p| Discourse.base_uri + p }
+    deny_all = [ "#{Discourse.base_uri}/" ]
+
+    result = {
+      header: "# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file",
+      agents: []
+    }
+
+    if SiteSetting.whitelisted_crawler_user_agents.present?
+      SiteSetting.whitelisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_paths }
+      end
+
+      result[:agents] << { name: '*', disallow: deny_all }
+    elsif SiteSetting.blacklisted_crawler_user_agents.present?
+      result[:agents] << { name: '*', disallow: deny_paths }
+      SiteSetting.blacklisted_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << { name: agent, disallow: deny_all }
       end
     else
-      path = :no_index
+      result[:agents] << { name: '*', disallow: deny_paths }
     end
 
-    render path, content_type: 'text/plain'
+    if SiteSetting.slow_down_crawler_user_agents.present?
+      SiteSetting.slow_down_crawler_user_agents.split('|').each do |agent|
+        result[:agents] << {
+          name: agent,
+          delay: SiteSetting.slow_down_crawler_rate,
+          disallow: deny_paths
+        }
+      end
+    end
+
+    result
   end
+
 end
diff --git a/app/views/robots_txt/index.erb b/app/views/robots_txt/index.erb
index 2e9105c902f..71ca94baa7b 100644
--- a/app/views/robots_txt/index.erb
+++ b/app/views/robots_txt/index.erb
@@ -1,30 +1,18 @@
-# See http://www.robotstxt.org/robotstxt.html for documentation on how to use the robots.txt file
+<%= @robots_info[:header] %>
 <% if Discourse.base_uri.present? %>
 # This robots.txt file is not used. Please append the content below in the robots.txt file located at the root
 <% end %>
 #
-<% @allowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-<% end %>
-<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
+<% @robots_info[:agents].each do |agent| %>
+User-agent: <%= agent[:name] %>
+<%- if agent[:delay] -%>
+Crawl-delay: <%= agent[:delay] %>
+<%- end -%>
+<% agent[:disallow].each do |path| %>
+Disallow: <%= path %>
 <% end %>
-<% if @disallowed_user_agents %>
-  <% @disallowed_user_agents.each do |user_agent| %>
-User-agent: <%= user_agent %>
-Disallow: <%= Discourse.base_uri + "/" %>
-  <% end %>
 <% end %>
 
 <%= server_plugin_outlet "robots_txt_index" %>
-
-<% @crawler_delayed_agents.each do |agent, delay| %>
-User-agent: <%= agent %>
-Crawl-delay: <%= delay %>
-<% RobotsTxtController::DISALLOWED_PATHS.each do |path| %>
-Disallow: <%= Discourse.base_uri + path %>
-<% end %>
-
-<% end %>
 
 
diff --git a/config/routes.rb b/config/routes.rb
index cc191d986a8..d64c0f28c4b 100644
--- a/config/routes.rb
+++ b/config/routes.rb
@@ -744,6 +744,7 @@ Discourse::Application.routes.draw do
   get "favicon/proxied" => "static#favicon", format: false
 
   get "robots.txt" => "robots_txt#index"
+  get "robots-builder.json" => "robots_txt#builder"
   get "offline.html" => "offline#index"
   get "manifest.json" => "metadata#manifest", as: :manifest
   get "opensearch" => "metadata#opensearch", format: :xml
diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb
index 87232518cb6..c5627c0f19b 100644
--- a/spec/requests/robots_txt_controller_spec.rb
+++ b/spec/requests/robots_txt_controller_spec.rb
@@ -1,6 +1,16 @@
 require 'rails_helper'
 
 RSpec.describe RobotsTxtController do
+  describe '#builder' do
+    it "returns json information for building a robots.txt" do
+      get "/robots-builder.json"
+      json = ::JSON.parse(response.body)
+      expect(json).to be_present
+      expect(json['header']).to be_present
+      expect(json['agents']).to be_present
+    end
+  end
+
   describe '#index' do
     context 'subfolder' do
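
A brief consumer sketch, not part of the patch itself: the robots-builder.json route added above returns the same data the ERB template renders, so a parent site hosting a subfolder Discourse can rebuild its own robots.txt from it. The script below only illustrates that idea; the host (example.com/forum), the output path, and the script itself are assumptions, not anything shipped by this commit.

#!/usr/bin/env ruby
# Illustrative consumer script (not included in the commit above).
# Fetches the JSON emitted by RobotsTxtController#builder and appends
# equivalent robots.txt directives to a parent site's robots.txt.
require 'net/http'
require 'json'
require 'uri'

# Hypothetical subfolder install; replace with the real base URL.
builder_url = URI('https://example.com/forum/robots-builder.json')
info = JSON.parse(Net::HTTP.get(builder_url))
# Expected shape, per fetch_robots_info:
#   { "header" => "# See http://www.robotstxt.org/robotstxt.html ...",
#     "agents" => [{ "name" => "*", "disallow" => ["/forum/admin", ...], "delay" => 10 }] }

lines = [info['header'], '']
info['agents'].each do |agent|
  lines << "User-agent: #{agent['name']}"
  lines << "Crawl-delay: #{agent['delay']}" if agent['delay']
  agent['disallow'].each { |path| lines << "Disallow: #{path}" }
  lines << ''
end

# Output path on the parent web server is assumed.
File.open('/var/www/robots.txt', 'a') { |f| f.puts(lines.join("\n")) }

A script like this could run from a deploy hook or a cron job so the parent robots.txt stays in sync whenever the crawler-related site settings change.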