From bb4e8899c41889f7316e512d6ed89a3847fa655b Mon Sep 17 00:00:00 2001 From: Sam Saffron Date: Mon, 11 May 2020 12:14:21 +1000 Subject: [PATCH] FEATURE: let Google index pages so it can remove them Google insists on indexing pages so it can figure out if they can be removed from the index. see: https://support.google.com/webmasters/answer/6332384?hl=en This change ensures that we have special behavior for Googlebot where we allow crawling, but block the actual indexing via X-Robots-Tag --- app/controllers/application_controller.rb | 8 +++++++- app/views/robots_txt/no_index.erb | 9 +++++++++ spec/requests/robots_txt_controller_spec.rb | 1 + spec/requests/topics_controller_spec.rb | 2 +- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 7abf1f36596..abde04d24be 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -806,7 +806,13 @@ class ApplicationController < ActionController::Base end def add_noindex_header - response.headers['X-Robots-Tag'] = 'noindex' if request.get? + if request.get? 
+ if SiteSetting.allow_index_in_robots_txt + response.headers['X-Robots-Tag'] = 'noindex' + else + response.headers['X-Robots-Tag'] = 'noindex, nofollow' + end + end end protected diff --git a/app/views/robots_txt/no_index.erb b/app/views/robots_txt/no_index.erb index 7697afcf260..a02cc1b1f5d 100644 --- a/app/views/robots_txt/no_index.erb +++ b/app/views/robots_txt/no_index.erb @@ -1,4 +1,13 @@ # See http://www.robotstxt.org/wc/norobots.html for documentation on how to use the robots.txt file # + +# Googlebot must be allowed to index so it can remove items from the index +# we return the X-Robots-Tag with noindex, nofollow which will ensure +# indexing is minimized and nothing shows up in Google search results +User-agent: googlebot +Allow: <%= Discourse.base_uri + "/" %> +Disallow: <%= Discourse.base_uri + "/uploads/*" %> + User-agent: * Disallow: <%= Discourse.base_uri + "/" %> + diff --git a/spec/requests/robots_txt_controller_spec.rb b/spec/requests/robots_txt_controller_spec.rb index c15297ff11d..16327997c9c 100644 --- a/spec/requests/robots_txt_controller_spec.rb +++ b/spec/requests/robots_txt_controller_spec.rb @@ -132,6 +132,7 @@ RSpec.describe RobotsTxtController do get '/robots.txt' expect(response.body).to_not include("Disallow: /u/") + expect(response.body).to include("User-agent: googlebot\nAllow") end it "returns overridden robots.txt if the file is overridden" do diff --git a/spec/requests/topics_controller_spec.rb b/spec/requests/topics_controller_spec.rb index b4d157d536f..b2c9b5ebdae 100644 --- a/spec/requests/topics_controller_spec.rb +++ b/spec/requests/topics_controller_spec.rb @@ -1853,7 +1853,7 @@ RSpec.describe TopicsController do get "/t/#{topic.slug}/#{topic.id}.json" - expect(response.headers['X-Robots-Tag']).to eq('noindex') + expect(response.headers['X-Robots-Tag']).to eq('noindex, nofollow') end it "doesn't store an incoming link when there's no referer" do