From 43da88db6c63d9be4337d5b7a5243895fb7916b5 Mon Sep 17 00:00:00 2001 From: Sam Date: Wed, 9 Mar 2022 18:25:20 +1100 Subject: [PATCH] PERF: avoid following links in topic RSS feeds (#16145) Topic RSS feeds contain many non-canonical links such as: - https://site.com/t/a-b-c/111/1 - https://site.com/t/a-b-c/111/2 - https://site.com/t/a-b-c/111/3 - https://site.com/t/a-b-c/111/4 - https://site.com/t/a-b-c/111/5 - https://site.com/t/a-b-c/111/6 Previously we were not indexing RSS feeds, yet the links in them were still being followed. This change means links in topic RSS feeds are ignored entirely, which avoids the expensive work of scanning them only to find they should not be included. --- app/controllers/application_controller.rb | 2 +- app/controllers/topics_controller.rb | 2 ++ spec/requests/topics_controller_spec.rb | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index b4b1013b372..22f47bd8bdb 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -897,7 +897,7 @@ class ApplicationController < ActionController::Base end def add_noindex_header - if request.get? + if request.get?
&& !response.headers['X-Robots-Tag'] if SiteSetting.allow_index_in_robots_txt response.headers['X-Robots-Tag'] = 'noindex' else diff --git a/app/controllers/topics_controller.rb b/app/controllers/topics_controller.rb index 5d1513fbcfd..da12f1c86e1 100644 --- a/app/controllers/topics_controller.rb +++ b/app/controllers/topics_controller.rb @@ -922,6 +922,8 @@ class TopicsController < ApplicationController end discourse_expires_in 1.minute + + response.headers['X-Robots-Tag'] = 'noindex, nofollow' render 'topics/show', formats: [:rss] end diff --git a/spec/requests/topics_controller_spec.rb b/spec/requests/topics_controller_spec.rb index 6b7ca165742..ed2cb355663 100644 --- a/spec/requests/topics_controller_spec.rb +++ b/spec/requests/topics_controller_spec.rb @@ -2850,6 +2850,11 @@ RSpec.describe TopicsController do get "/t/foo/#{topic.id}.rss" expect(response.status).to eq(200) expect(response.media_type).to eq('application/rss+xml') + + # our RSS feed is full of post 1/2/3/4/5 links, we do not want it included + # in the index, and do not want links followed + # this allows us to remove it while allowing via robots.txt + expect(response.headers['X-Robots-Tag']).to eq('noindex, nofollow') end it 'renders rss of the topic correctly with subfolder' do