From bb4e8899c41889f7316e512d6ed89a3847fa655b Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Mon, 11 May 2020 12:14:21 +1000 Subject: [PATCH] FEATURE: let Google index pages so it can remove them Google insists on indexing pages so it can figure out if they can be removed from the index. see: https://support.google.com/webmasters/answer/6332384?hl=en This change ensures that we have special behavior for Googlebot where we allow crawling, but block the actual indexing via X-Robots-Tag --- app/controllers/application_controller.rb | 8 +++++++- app/views/robots_txt/no_index.erb | 9 +++++++++ spec/requests/robots_txt_controller_spec.rb | 1 + spec/requests/topics_controller_spec.rb | 2 +- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 7abf1f36596..abde04d24be 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -806,7 +806,13 @@ class ApplicationController < ActionController::Base end def add_noindex_header - response.headers['X-Robots-Tag'] = 'noindex' if request.get? + if request.get? 
+ if SiteSetting.allow_index_in_robots_txt + response.headers['X-Robots-Tag'] = 'noindex' + else + response.headers['X-Robots-Tag'] = 'noindex, nofollow' + end + end end protected diff --git a/app/views/robots_txt/no_index.erb b/app/views/robots_txt/no_index.erb index 7697afcf260..a02cc1b1f5d 100644 --- a/app/views/robots_txt/no_index.erb +++ b/app/views/robots_txt/no_index.erb @@ -1,4 +1,13 @@ # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file # + +# Googlebot must be allowed to index so it can remove items from the index +# we return the X-Robots-Tag with noindex, nofollow which will ensure +# indexing is minimized and nothing shows up in Google search results +User-agent: googlebot +Allow: <%= Discourse.base_uri + "/" %> +Disallow: <%= Discourse.base_uri + "/uploads/*" %> + User-agent: * Disallow: <%= Discourse.base_uri + "/" %> + diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb index c15297ff11d..16327997c9c 100644 --- a/spec/requests/robots_txt_controller_spec.rb +++ b/spec/requests/robots_txt_controller_spec.rb @@ -132,6 +132,7 @@ RSpec.describe RobotsTxtController do get '/robots.txt' expect(response.body).to_not include("Disallow: /u/") + expect(response.body).to include("User-agent: googlebot\nAllow") end it "returns overridden robots.txt if the file is overridden" do diff --git a/spec/requests/topics_controller_spec.rb b/spec/requests/topics_controller_spec.rb index b4d157d536f..b2c9b5ebdae 100644 --- a/spec/requests/topics_controller_spec.rb +++ b/spec/requests/topics_controller_spec.rb @@ -1853,7 +1853,7 @@ RSpec.describe TopicsController do get "/t/#{topic.slug}/#{topic.id}.json" - expect(response.headers['X-Robots-Tag']).to eq('noindex') + expect(response.headers['X-Robots-Tag']).to eq('noindex, nofollow') end it "doesn't store an incoming link when there's no referer" do