Only block domains at the final destination (#15689)

In an earlier PR, we decided that we only want to block a domain if 
the blocked domain in the SiteSetting is the final destination (/t/59305). That 
PR used `FinalDestination#get`. `resolve` however is used several places
 but blocks domains along the redirect chain when certain options are provided.

This commit changes the default options for `resolve` to not do that. Existing
users of `FinalDestination#resolve` are
- `Oneboxer#external_onebox`
- our onebox helper `fetch_html_doc`, which is used in amazon, standard embed 
and youtube
  - these folks already go through `Oneboxer#external_onebox` which already
  blocks correctly
This commit is contained in:
Natalie Tay
2022-01-31 15:35:12 +08:00
committed by GitHub
parent 30454b3f27
commit aac9f43038
8 changed files with 101 additions and 40 deletions

View File

@ -44,9 +44,9 @@ class FinalDestination
@opts[:max_redirects] ||= 5 @opts[:max_redirects] ||= 5
@opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) } @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) }
@ignored = @opts[:ignore_hostnames] || []
@limit = @opts[:max_redirects] @limit = @opts[:max_redirects]
@ignored = []
if @limit > 0 if @limit > 0
ignore_redirects = [Discourse.base_url_no_prefix] ignore_redirects = [Discourse.base_url_no_prefix]

View File

@ -61,7 +61,7 @@ class InlineOneboxer
if uri.present? && if uri.present? &&
uri.hostname.present? && uri.hostname.present? &&
(always_allow || allowed_domains.include?(uri.hostname)) && (always_allow || allowed_domains.include?(uri.hostname)) &&
!domain_is_blocked?(uri.hostname) !Onebox::DomainChecker.is_blocked?(uri.hostname)
title = RetrieveTitle.crawl(url) title = RetrieveTitle.crawl(url)
title = nil if title && title.length < MIN_TITLE_LENGTH title = nil if title && title.length < MIN_TITLE_LENGTH
return onebox_for(url, title, opts) return onebox_for(url, title, opts)
@ -73,12 +73,6 @@ class InlineOneboxer
private private
def self.domain_is_blocked?(hostname)
SiteSetting.blocked_onebox_domains&.split('|').any? do |blocked|
hostname == blocked || hostname.end_with?(".#{blocked}")
end
end
def self.onebox_for(url, title, opts) def self.onebox_for(url, title, opts)
title = title && Emoji.gsub_emoji_to_unicode(title) title = title && Emoji.gsub_emoji_to_unicode(title)
if title && opts[:post_number] if title && opts[:post_number]

View File

@ -0,0 +1,11 @@
# frozen_string_literal: true
module Onebox
class DomainChecker
def self.is_blocked?(hostname)
SiteSetting.blocked_onebox_domains&.split('|').any? do |blocked|
hostname == blocked || hostname.end_with?(".#{blocked}")
end
end
end
end

View File

@ -379,10 +379,6 @@ module Oneboxer
end end
end end
def self.blocked_domains
SiteSetting.blocked_onebox_domains.split("|")
end
def self.preserve_fragment_url_hosts def self.preserve_fragment_url_hosts
@preserve_fragment_url_hosts ||= ['http://github.com'] @preserve_fragment_url_hosts ||= ['http://github.com']
end end
@ -420,7 +416,7 @@ module Oneboxer
return error_box return error_box
end end
return blank_onebox if uri.blank? || blocked_domains.any? { |hostname| uri.hostname.match?(hostname) } return blank_onebox if uri.blank? || Onebox::DomainChecker.is_blocked?(uri.hostname)
onebox_options = { onebox_options = {
max_width: 695, max_width: 695,
@ -538,7 +534,6 @@ module Oneboxer
def self.get_final_destination_options(url, strategy = nil) def self.get_final_destination_options(url, strategy = nil)
fd_options = { fd_options = {
ignore_redirects: ignore_redirects, ignore_redirects: ignore_redirects,
ignore_hostnames: blocked_domains,
force_get_hosts: force_get_hosts, force_get_hosts: force_get_hosts,
force_custom_user_agent_hosts: force_custom_user_agent_hosts, force_custom_user_agent_hosts: force_custom_user_agent_hosts,
preserve_fragment_url_hosts: preserve_fragment_url_hosts, preserve_fragment_url_hosts: preserve_fragment_url_hosts,

View File

@ -60,7 +60,7 @@ module RetrieveTitle
encoding = nil encoding = nil
fd.get do |_response, chunk, uri| fd.get do |_response, chunk, uri|
if (uri.present? && InlineOneboxer.domain_is_blocked?(uri.hostname)) if (uri.present? && Onebox::DomainChecker.is_blocked?(uri.hostname))
throw :done throw :done
end end

View File

@ -160,28 +160,54 @@ describe Oneboxer do
end end
end end
it "does not crawl blocklisted URLs" do context ".external_onebox" do
SiteSetting.blocked_onebox_domains = "git.*.com|bitbucket.com" html = <<~HTML
url = 'https://github.com/discourse/discourse/commit/21b562852885f883be43032e03c709241e8e6d4f' <html>
stub_request(:head, 'https://discourse.org/').to_return(status: 302, body: "", headers: { location: url }) <head>
<meta property="og:title" content="Cats">
<meta property="og:description" content="Meow">
</head>
<body>
<p>body</p>
</body>
<html>
HTML
expect(Oneboxer.external_onebox(url)[:onebox]).to be_empty context "blacklisted domains" do
expect(Oneboxer.external_onebox('https://discourse.org/')[:onebox]).to be_empty
end
it "does not consider ignore_redirects domains as blocklisted" do it "does not return a onebox if redirect uri final destination is in blacklist" do
url = 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/' SiteSetting.blocked_onebox_domains = "kitten.com"
stub_request(:head, url).to_return(status: 200, body: "", headers: {})
stub_request(:get, url).to_return(status: 200, body: "", headers: {})
expect(Oneboxer.external_onebox(url)[:onebox]).to be_present stub_request(:get, "http://cat.com/meow").to_return(status: 301, body: "", headers: { "location" => "https://kitten.com" })
end stub_request(:head, "http://cat.com/meow").to_return(status: 301, body: "", headers: { "location" => "https://kitten.com" })
it "censors external oneboxes" do stub_request(:get, "https://kitten.com").to_return(status: 200, body: html, headers: {})
Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word") stub_request(:head, "https://kitten.com").to_return(status: 200, body: "", headers: {})
url = 'https://example.com/' expect(Oneboxer.external_onebox("http://cat.com/meow")[:onebox]).to be_empty
stub_request(:any, url).to_return(status: 200, body: <<~HTML, headers: {}) expect(Oneboxer.external_onebox("https://kitten.com")[:onebox]).to be_empty
end
it "returns onebox if 'midway redirect' is blocked but final redirect uri is not blocked" do
SiteSetting.blocked_onebox_domains = "middle.com"
stub_request(:get, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" })
stub_request(:head, "https://cat.com/start").to_return(status: 301, body: "a", headers: { "location" => "https://middle.com/midway" })
stub_request(:head, "https://middle.com/midway").to_return(status: 301, body: "b", headers: { "location" => "https://cat.com/end" })
stub_request(:get, "https://cat.com/end").to_return(status: 200, body: html)
stub_request(:head, "https://cat.com/end").to_return(status: 200, body: "", headers: {})
expect(Oneboxer.external_onebox("https://cat.com/start")[:onebox]).to be_present
end
end
it "censors external oneboxes" do
Fabricate(:watched_word, action: WatchedWord.actions[:censor], word: "bad word")
url = 'https://example.com/'
stub_request(:any, url).to_return(status: 200, body: <<~HTML, headers: {})
<html> <html>
<head> <head>
<meta property="og:title" content="title with bad word"> <meta property="og:title" content="title with bad word">
@ -191,13 +217,23 @@ describe Oneboxer do
<p>content with bad word</p> <p>content with bad word</p>
</body> </body>
<html> <html>
HTML HTML
onebox = Oneboxer.external_onebox(url) onebox = Oneboxer.external_onebox(url)
expect(onebox[:onebox]).to include('title with') expect(onebox[:onebox]).to include('title with')
expect(onebox[:onebox]).not_to include('bad word') expect(onebox[:onebox]).not_to include('bad word')
expect(onebox[:preview]).to include('title with') expect(onebox[:preview]).to include('title with')
expect(onebox[:preview]).not_to include('bad word') expect(onebox[:preview]).not_to include('bad word')
end
it "returns onebox" do
SiteSetting.blocked_onebox_domains = "not.me"
stub_request(:get, "https://its.me").to_return(status: 200, body: html)
stub_request(:head, "https://its.me").to_return(status: 200, body: "", headers: {})
expect(Oneboxer.external_onebox("https://its.me")[:onebox]).to be_present
end
end end
it "uses the Onebox custom user agent on specified hosts" do it "uses the Onebox custom user agent on specified hosts" do

View File

@ -110,7 +110,6 @@ describe RetrieveTitle do
stub_request(:get, "https://wikipedia.com/amazing") stub_request(:get, "https://wikipedia.com/amazing")
.to_return(status: 200, body: "<html><title>very amazing</title>", headers: {}) .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})
IPSocket.stubs(:getaddress).returns('100.2.3.4')
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil) expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq(nil)
end end
@ -126,7 +125,6 @@ describe RetrieveTitle do
stub_request(:get, "https://cat.com/meow") stub_request(:get, "https://cat.com/meow")
.to_return(status: 200, body: "<html><title>very amazing</title>", headers: {}) .to_return(status: 200, body: "<html><title>very amazing</title>", headers: {})
IPSocket.stubs(:getaddress).returns('100.2.3.4')
expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing") expect(RetrieveTitle.crawl("http://foobar.com/amazing")).to eq("very amazing")
end end
end end

View File

@ -0,0 +1,27 @@
# frozen_string_literal: true
require "rails_helper"
describe Onebox::DomainChecker do
describe '.is_blocked?' do
before do
SiteSetting.blocked_onebox_domains = "api.cat.org|kitten.cloud"
end
describe "returns true when entirely matched" do
it { expect(described_class.is_blocked?("api.cat.org")).to be(true) }
it { expect(described_class.is_blocked?("kitten.cloud")).to be(true) }
it { expect(described_class.is_blocked?("api.dog.org")).to be(false) }
it { expect(described_class.is_blocked?("puppy.cloud")).to be(false) }
end
describe "returns true when ends with .<domain>" do
it { expect(described_class.is_blocked?("dev.api.cat.org")).to be(true) }
it { expect(described_class.is_blocked?(".api.cat.org")).to be(true) }
it { expect(described_class.is_blocked?("dev.kitten.cloud")).to be(true) }
it { expect(described_class.is_blocked?(".kitten.cloud")).to be(true) }
it { expect(described_class.is_blocked?("xapi.cat.org")).to be(false) }
it { expect(described_class.is_blocked?("xkitten.cloud")).to be(false) }
end
end
end