From b0656f3ed0e4c296aa9c69a278289c679578ae87 Mon Sep 17 00:00:00 2001 From: Osama Sayegh Date: Fri, 11 Mar 2022 09:18:12 +0300 Subject: [PATCH] FIX: Apply onebox blocked domain checks on every redirect (#16150) The `blocked onebox domains` setting lets site owners change what sites are allowed to be oneboxed. When a link is entered into a post, Discourse checks the domain of the link against that setting and blocks the onebox if the domain is blocked. But if there's a chain of redirects, then only the final destination website is checked against the site setting. This commit amends that behavior so that every website in the redirect chain is checked against the site setting, and if anything is blocked the original link doesn't onebox at all in the post. The `Discourse-No-Onebox` header is also checked in every response and the onebox is blocked if the header is set to "1". Additionally, Discourse will now include the `Discourse-No-Onebox` header with every response if the site requires login to access content. This is done to signal to a Discourse instance that it shouldn't attempt to onebox other Discourse instances if they're login-only. Non-Discourse websites can also use include that header if they don't wish to have Discourse onebox their content. Internal ticket: t59305. --- app/controllers/application_controller.rb | 3 + lib/final_destination.rb | 49 +++++++++++++-- lib/oneboxer.rb | 9 ++- lib/retrieve_title.rb | 11 ++-- spec/lib/inline_oneboxer_spec.rb | 72 +++++++++++++++++------ spec/lib/oneboxer_spec.rb | 50 ++++++++++++++-- spec/lib/retrieve_title_spec.rb | 15 ++++- 7 files changed, 171 insertions(+), 38 deletions(-) diff --git a/app/controllers/application_controller.rb b/app/controllers/application_controller.rb index 22f47bd8bdb..97c8f721519 100644 --- a/app/controllers/application_controller.rb +++ b/app/controllers/application_controller.rb @@ -90,6 +90,9 @@ class ApplicationController < ActionController::Base response.cache_control[:no_cache] = true response.cache_control[:extras] = ["no-store"] end + if SiteSetting.login_required + response.headers['Discourse-No-Onebox'] = '1' + end end def conditionally_allow_site_embedding diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 5909640c154..11561f17f13 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -74,6 +74,7 @@ class FinalDestination @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) } @validate_uri = @opts.fetch(:validate_uri) { true } @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent + @stop_at_blocked_pages = @opts[:stop_at_blocked_pages] end def self.connection_timeout @@ -140,9 +141,12 @@ class FinalDestination uri = URI(uri.to_s) end - return nil unless validate_uri + return if !validate_uri + return if @stop_at_blocked_pages && blocked_domain?(uri) - result, (location, cookie) = safe_get(uri, &blk) + result, headers_subset = safe_get(uri, &blk) + cookie = headers_subset.set_cookie + location = headers_subset.location if result == :redirect && (redirects == 0 || !location) return nil @@ -222,6 +226,13 @@ class FinalDestination response_block: request_validator ) + if @stop_at_blocked_pages + if blocked_domain?(@uri) || response.headers['Discourse-No-Onebox'] == "1" + @status = :blocked_page + return + end + end + location = nil response_headers = nil response_status = response.status.to_i @@ -253,6 +264,18 @@ class FinalDestination when 103, 400, 405, 406, 409, 500, 501 response_status, small_headers = small_get(request_headers) + if @stop_at_blocked_pages + # this may seem weird, but the #to_hash method of the response object + # of ruby's net/http lib returns a hash where each value is an array. + # small_headers here is like that so our no onebox header value is an + # array if it's set. Also the hash keys are always lower-cased. + dont_onebox = small_headers["discourse-no-onebox"]&.join("") == "1" + if dont_onebox || blocked_domain?(@uri) + @status = :blocked_page + return + end + end + if response_status == 200 @status = :resolved return @uri @@ -425,6 +448,7 @@ class FinalDestination def safe_get(uri) result = nil unsafe_close = false + headers_subset = Struct.new(:location, :set_cookie).new safe_session(uri) do |http| headers = request_headers.merge( @@ -435,8 +459,19 @@ class FinalDestination req = Net::HTTP::Get.new(uri.request_uri, headers) http.request(req) do |resp| + headers_subset.set_cookie = resp['Set-Cookie'] + + if @stop_at_blocked_pages + dont_onebox = resp["Discourse-No-Onebox"] == "1" + if dont_onebox + result = :blocked, headers_subset + next + end + end + if Net::HTTPRedirection === resp - result = :redirect, [resp['location'], resp['Set-Cookie']] + headers_subset.location = resp['location'] + result = :redirect, headers_subset end if Net::HTTPSuccess === resp @@ -460,7 +495,7 @@ class FinalDestination raise StandardError end end - result = :ok + result = :ok, headers_subset else catch(:done) do yield resp, nil, nil @@ -471,7 +506,7 @@ class FinalDestination result rescue StandardError - unsafe_close ? :ok : raise + unsafe_close ? [:ok, headers_subset] : raise end def safe_session(uri) @@ -505,4 +540,8 @@ class FinalDestination uri(complete_url) end + + def blocked_domain?(uri) + Onebox::DomainChecker.is_blocked?(uri.hostname) + end end diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 132f30d6b25..29585fdc54f 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -397,9 +397,14 @@ module Oneboxer available_strategies ||= Oneboxer.ordered_strategies(uri.hostname) strategy = available_strategies.shift - fd = FinalDestination.new(url, get_final_destination_options(url, strategy)) + fd = FinalDestination.new( + url, + get_final_destination_options(url, strategy).merge(stop_at_blocked_pages: true) + ) uri = fd.resolve + return blank_onebox if fd.status == :blocked_page + if fd.status != :resolved args = { link: url } if fd.status == :invalid_address @@ -416,7 +421,7 @@ module Oneboxer return error_box end - return blank_onebox if uri.blank? || Onebox::DomainChecker.is_blocked?(uri.hostname) + return blank_onebox if uri.blank? onebox_options = { max_width: 695, diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 74ad06c462b..1eb4f182a79 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -5,8 +5,9 @@ module RetrieveTitle def self.crawl(url) fetch_title(url) - rescue Exception - # If there was a connection error, do nothing + rescue Exception => ex + raise if Rails.env.test? + Rails.logger.error(ex) end def self.extract_title(html, encoding = nil) @@ -52,17 +53,13 @@ module RetrieveTitle # Fetch the beginning of a HTML document at a url def self.fetch_title(url) - fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT) + fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT, stop_at_blocked_pages: true) current = nil title = nil encoding = nil fd.get do |_response, chunk, uri| - if (uri.present? && Onebox::DomainChecker.is_blocked?(uri.hostname)) - throw :done - end - unless Net::HTTPRedirection === _response if current current << chunk diff --git a/spec/lib/inline_oneboxer_spec.rb b/spec/lib/inline_oneboxer_spec.rb index 9d0c4a67892..b7341e88494 100644 --- a/spec/lib/inline_oneboxer_spec.rb +++ b/spec/lib/inline_oneboxer_spec.rb @@ -29,34 +29,19 @@ describe InlineOneboxer do it "puts an entry in the cache" do SiteSetting.enable_inline_onebox_on_all_domains = true - url = "https://example.com/random-url" + url = "https://example.com/good-url" stub_request(:get, url).to_return(status: 200, body: "a blog") InlineOneboxer.invalidate(url) expect(InlineOneboxer.cache_lookup(url)).to be_blank result = InlineOneboxer.lookup(url) - expect(result).to be_present + expect(result[:title]).to be_present cached = InlineOneboxer.cache_lookup(url) expect(cached[:url]).to eq(url) expect(cached[:title]).to eq('a blog') end - - it "puts an entry in the cache for failed onebox" do - SiteSetting.enable_inline_onebox_on_all_domains = true - url = "https://example.com/random-url" - - InlineOneboxer.invalidate(url) - expect(InlineOneboxer.cache_lookup(url)).to be_blank - - result = InlineOneboxer.lookup(url) - expect(result).to be_present - - cached = InlineOneboxer.cache_lookup(url) - expect(cached[:url]).to eq(url) - expect(cached[:title]).to be_nil - end end context ".lookup" do @@ -199,6 +184,11 @@ describe InlineOneboxer do describe "lookups for blocked domains in the hostname" do shared_examples "blocks the domain" do |setting, domain_to_test| it "does not retrieve title" do + stub_request(:get, domain_to_test) + .to_return( + status: 200, + body: "hello world", + ) SiteSetting.blocked_onebox_domains = setting onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true) @@ -209,11 +199,16 @@ describe InlineOneboxer do shared_examples "does not fulfil blocked domain" do |setting, domain_to_test| it "retrieves title" do + stub_request(:get, domain_to_test) + .to_return( + status: 200, + body: "hello world", + ) SiteSetting.blocked_onebox_domains = setting onebox = InlineOneboxer.lookup(domain_to_test, skip_cache: true) - expect(onebox).to be_present + expect(onebox[:title]).to be_present end end @@ -228,6 +223,47 @@ describe InlineOneboxer do include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.2kitten.cloud" include_examples "does not fulfil blocked domain", "kitten.cloud", "https://cat.kitten.cloud9" include_examples "does not fulfil blocked domain", "api.cat.org", "https://api-cat.org" + + it "doesn't retrieve title if a blocked domain is encountered anywhere in the redirect chain" do + SiteSetting.blocked_onebox_domains = "redirect.com" + stub_request(:get, "https://mainwebsite.com/blah") + .to_return(status: 301, body: "", headers: { "location" => "https://redirect.com/blah" }) + stub_request(:get, "https://redirect.com/blah") + .to_return(status: 301, body: "", headers: { "location" => "https://finalwebsite.com/blah" }) + stub_request(:get, "https://finalwebsite.com/blah") + .to_return(status: 200, body: "hello world") + onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah", skip_cache: true) + + expect(onebox[:title]).to be_blank + end + + it "doesn't retrieve title if the Discourse-No-Onebox header == 1" do + stub_request(:get, "https://mainwebsite.com/blah") + .to_return( + status: 200, + body: "hello world", + headers: { "Discourse-No-Onebox" => "1" } + ) + onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah", skip_cache: true) + expect(onebox[:title]).to be_blank + + stub_request(:get, "https://mainwebsite.com/blah/2") + .to_return( + status: 301, + body: "", + headers: { "location" => "https://mainwebsite.com/blah/2/redirect" } + ) + stub_request(:get, "https://mainwebsite.com/blah/2/redirect") + .to_return( + status: 301, + body: "", + headers: { "Discourse-No-Onebox" => "1", "location" => "https://somethingdoesnotmatter.com" } + ) + onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah/2", skip_cache: true) + expect(onebox[:title]).to be_blank + onebox = InlineOneboxer.lookup("https://mainwebsite.com/blah/2/redirect", skip_cache: true) + expect(onebox[:title]).to be_blank + end end end end diff --git a/spec/lib/oneboxer_spec.rb b/spec/lib/oneboxer_spec.rb index 395c3e40342..bf17e6edb53 100644 --- a/spec/lib/oneboxer_spec.rb +++ b/spec/lib/oneboxer_spec.rb @@ -182,11 +182,16 @@ describe Oneboxer do stub_request(:get, "https://kitten.com").to_return(status: 200, body: html, headers: {}) stub_request(:head, "https://kitten.com").to_return(status: 200, body: "", headers: {}) - expect(Oneboxer.external_onebox("http://cat.com/meow")[:onebox]).to be_empty - expect(Oneboxer.external_onebox("https://kitten.com")[:onebox]).to be_empty + result = Oneboxer.external_onebox("http://cat.com/meow") + expect(result[:onebox]).to be_empty + expect(result[:preview]).to be_empty + + result = Oneboxer.external_onebox("http://kitten.com") + expect(result[:onebox]).to be_empty + expect(result[:preview]).to be_empty end - it "returns onebox if 'midway redirect' is blocked but final redirect uri is not blocked" do + it "does not return onebox if anything in the redirect chain is blocked" do SiteSetting.blocked_onebox_domains = "middle.com" stub_request(:get, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" }) @@ -197,7 +202,44 @@ describe Oneboxer do stub_request(:get, "https://cat.com/end").to_return(status: 200, body: html) stub_request(:head, "https://cat.com/end").to_return(status: 200, body: "", headers: {}) - expect(Oneboxer.external_onebox("https://cat.com/start")[:onebox]).to be_present + result = Oneboxer.external_onebox("https://cat.com/start") + expect(result[:onebox]).to be_empty + expect(result[:preview]).to be_empty + end + + it "does not return onebox if the Discourse-No-Onebox header == 1" do + stub_request(:get, "https://website.com/discourse-no-onebox") + .to_return(status: 200, body: "abc", headers: { "Discourse-No-Onebox" => "1" }) + stub_request(:head, "https://website.com/discourse-no-onebox") + .to_return(status: 200, body: "", headers: { "Discourse-No-Onebox" => "1" }) + + result = Oneboxer.external_onebox("https://website.com/discourse-no-onebox") + expect(result[:onebox]).to be_empty + expect(result[:preview]).to be_empty + end + + it "does not return onebox if the Discourse-No-Onebox header == 1 anywhere in the redirect chain" do + stub_request(:get, "https://website.com/redirect-no-onebox") + .to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://willneverreach.com" }) + stub_request(:head, "https://website.com/redirect-no-onebox") + .to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://willneverreach.com" }) + + result = Oneboxer.external_onebox("https://website.com/redirect-no-onebox") + expect(result[:onebox]).to be_empty + expect(result[:preview]).to be_empty + + stub_request(:get, "https://website.com/redirect") + .to_return(status: 301, body: "", headers: { "location" => "https://website.com/redirect/dont-onebox" }) + stub_request(:head, "https://website.com/redirect") + .to_return(status: 301, body: "", headers: { "location" => "https://website.com/redirect/dont-onebox" }) + stub_request(:get, "https://website.com/redirect/dont-onebox") + .to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://wontreachme.com" }) + stub_request(:head, "https://website.com/redirect/dont-onebox") + .to_return(status: 301, body: "", headers: { "Discourse-No-Onebox" => "1", "location" => "https://wontreachme.com" }) + + result = Oneboxer.external_onebox("https://website.com/redirect") + expect(result[:onebox]).to be_empty + expect(result[:preview]).to be_empty end end diff --git a/spec/lib/retrieve_title_spec.rb b/spec/lib/retrieve_title_spec.rb index 62173952097..1b554123083 100644 --- a/spec/lib/retrieve_title_spec.rb +++ b/spec/lib/retrieve_title_spec.rb @@ -111,7 +111,7 @@ describe RetrieveTitle do expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil) end - it "returns title if 'midway redirect' is blocked but final redirect uri is not blocked" do + it "doesn't return title if a blocked domain is encountered anywhere in the redirect chain" do SiteSetting.blocked_onebox_domains = "wikipedia.com" stub_request(:get, "http://foobar.com/amazing") @@ -123,7 +123,18 @@ describe RetrieveTitle do stub_request(:get, "https://cat.com/meow") .to_return(status: 200, body: "very amazing", headers: {}) - expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing") + expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to be_blank + end + + it "doesn't return title if the Discourse-No-Onebox header == 1" do + stub_request(:get, "https://cat.com/meow/no-onebox") + .to_return( + status: 200, + body: "discourse stay away", + headers: { "Discourse-No-Onebox" => "1" } + ) + + expect(RetrieveTitle.crawl("https://cat.com/meow/no-onebox")).to be_blank end end