FEATURE: Oneboxer cache response body (#12562)

* FEATURE: Cache successful HTTP GET requests during Oneboxing

Some oneboxes may fail if when making excessive and/or odd requests against the target domains. This change provides a simple mechanism to cache the results of succesful GET requests as part of the oneboxing process, with the goal of reducing repeated requests and ultimately improving the rate of successful oneboxing.

To enable:

Set `SiteSetting.cache_onebox_response_body` to `true`

Add the domains you’re interesting in caching to `SiteSetting. cache_onebox_response_body_domains` e.g. `example.com|example.org|example.net`

Optionally set `SiteSetting.cache_onebox_user_agent` to a user agent string of your choice to use when making requests against domains in the above list.

* FIX: Swap order of duration and value in redis call

The correct order for `setex` arguments is `key`, `duration`, and `value`.

Duration and value had been flipped, however the code would not have thrown an error because we were caching the value of `1.day.to_i` for a period of 1 seconds… The intention appears to be to set a value of 1 (purely as a flag) for a period of 1 day.
This commit is contained in:
jbrw
2021-03-31 13:19:34 -04:00
committed by GitHub
parent e704f0a541
commit 68d0916eb5
4 changed files with 130 additions and 21 deletions

View File

@ -1638,6 +1638,16 @@ onebox:
facebook_app_access_token: facebook_app_access_token:
default: "" default: ""
secret: true secret: true
cache_onebox_response_body:
default: false
hidden: true
cache_onebox_response_body_domains:
default: ""
type: list
hidden: true
cache_onebox_user_agent:
default: ""
hidden: true
spam: spam:
add_rel_nofollow_to_user_content: true add_rel_nofollow_to_user_content: true
hide_post_sensitivity: hide_post_sensitivity:

View File

@ -16,7 +16,7 @@ class FinalDestination
def self.cache_https_domain(domain) def self.cache_https_domain(domain)
key = redis_https_key(domain) key = redis_https_key(domain)
Discourse.redis.without_namespace.setex(key, "1", 1.day.to_i).present? Discourse.redis.without_namespace.setex(key, 1.day.to_i, "1")
end end
def self.is_https_domain?(domain) def self.is_https_domain?(domain)
@ -28,6 +28,8 @@ class FinalDestination
"HTTPS_DOMAIN_#{domain}" "HTTPS_DOMAIN_#{domain}"
end end
DEFAULT_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15"
attr_reader :status, :cookie, :status_code, :ignored attr_reader :status, :cookie, :status_code, :ignored
def initialize(url, opts = nil) def initialize(url, opts = nil)
@ -38,6 +40,7 @@ class FinalDestination
@force_get_hosts = @opts[:force_get_hosts] || [] @force_get_hosts = @opts[:force_get_hosts] || []
@preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || [] @preserve_fragment_url_hosts = @opts[:preserve_fragment_url_hosts] || []
@force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || [] @force_custom_user_agent_hosts = @opts[:force_custom_user_agent_hosts] || []
@default_user_agent = @opts[:default_user_agent] || DEFAULT_USER_AGENT
@opts[:max_redirects] ||= 5 @opts[:max_redirects] ||= 5
@opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) } @opts[:lookup_ip] ||= lambda { |host| FinalDestination.lookup_ip(host) }
@ -67,7 +70,7 @@ class FinalDestination
@timeout = @opts[:timeout] || nil @timeout = @opts[:timeout] || nil
@preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) } @preserve_fragment_url = @preserve_fragment_url_hosts.any? { |host| hostname_matches?(host) }
@validate_uri = @opts.fetch(:validate_uri) { true } @validate_uri = @opts.fetch(:validate_uri) { true }
@user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15" @user_agent = @force_custom_user_agent_hosts.any? { |host| hostname_matches?(host) } ? Onebox.options.user_agent : @default_user_agent
end end
def self.connection_timeout def self.connection_timeout
@ -182,20 +185,35 @@ class FinalDestination
end end
end end
if Oneboxer.cached_response_body_exists?(@uri.to_s)
@status = :resolved
return @uri
end
headers = request_headers headers = request_headers
middlewares = Excon.defaults[:middlewares]
middlewares << Excon::Middleware::Decompress if @http_verb == :get
response = Excon.public_send(@http_verb, response = Excon.public_send(@http_verb,
@uri.to_s, @uri.to_s,
read_timeout: timeout, read_timeout: timeout,
headers: headers headers: headers,
middlewares: middlewares
) )
location = nil location = nil
response_headers = nil response_headers = nil
response_status = response.status.to_i response_status = response.status.to_i
case response.status case response.status
when 200 when 200
# Cache body of successful `get` requests
if @http_verb == :get
if Oneboxer.cache_response_body?(@uri)
Oneboxer.cache_response_body(@uri.to_s, response.body)
end
end
@status = :resolved @status = :resolved
return @uri return @uri
when 400, 405, 406, 409, 501 when 400, 405, 406, 409, 501

View File

@ -26,13 +26,17 @@ module Oneboxer
@ignore_redirects ||= ['http://www.dropbox.com', 'http://store.steampowered.com', 'http://vimeo.com', Discourse.base_url] @ignore_redirects ||= ['http://www.dropbox.com', 'http://store.steampowered.com', 'http://vimeo.com', Discourse.base_url]
end end
def self.force_get_hosts def self.amazon_domains
@force_get_hosts ||= begin
hosts = ['http://us.battle.net', 'https://news.yahoo.com']
amazon_suffixes = %w(com com.br ca cn fr de in it co.jp com.mx nl pl sa sg es se com.tr ae co.uk) amazon_suffixes = %w(com com.br ca cn fr de in it co.jp com.mx nl pl sa sg es se com.tr ae co.uk)
amazon_suffixes.collect { |suffix| "https://www.amazon.#{suffix}" }
hosts + amazon_suffixes.collect { |suffix| "https://www.amazon.#{suffix}" }
end end
def self.force_get_hosts
hosts = ['http://us.battle.net', 'https://news.yahoo.com']
hosts += SiteSetting.cache_onebox_response_body_domains.split('|').collect { |domain| "https://www.#{domain}" }
hosts += amazon_domains
hosts.uniq
end end
def self.force_custom_user_agent_hosts def self.force_custom_user_agent_hosts
@ -80,6 +84,33 @@ module Oneboxer
Discourse.cache.delete(onebox_failed_cache_key(url)) Discourse.cache.delete(onebox_failed_cache_key(url))
end end
def self.cache_response_body?(uri)
uri = URI.parse(uri) if uri.is_a?(String)
if SiteSetting.cache_onebox_response_body?
SiteSetting.cache_onebox_response_body_domains.split("|").any? { |domain| uri.hostname.ends_with?(domain) }
end
end
def self.cache_response_body(uri, response)
key = redis_cached_response_body_key(uri)
Discourse.redis.without_namespace.setex(key, 1.minutes.to_i, response)
end
def self.cached_response_body_exists?(uri)
key = redis_cached_response_body_key(uri)
Discourse.redis.without_namespace.exists(key).to_i > 0
end
def self.fetch_cached_response_body(uri)
key = redis_cached_response_body_key(uri)
Discourse.redis.without_namespace.get(key)
end
def self.redis_cached_response_body_key(uri)
"CACHED_RESPONSE_#{uri}"
end
# Parse URLs out of HTML, returning the document when finished. # Parse URLs out of HTML, returning the document when finished.
def self.each_onebox_link(string_or_doc, extra_paths: []) def self.each_onebox_link(string_or_doc, extra_paths: [])
doc = string_or_doc doc = string_or_doc
@ -368,12 +399,18 @@ module Oneboxer
def self.external_onebox(url) def self.external_onebox(url)
Discourse.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do Discourse.cache.fetch(onebox_cache_key(url), expires_in: 1.day) do
fd = FinalDestination.new(url, fd_options = {
ignore_redirects: ignore_redirects, ignore_redirects: ignore_redirects,
ignore_hostnames: blocked_domains, ignore_hostnames: blocked_domains,
force_get_hosts: force_get_hosts, force_get_hosts: force_get_hosts,
force_custom_user_agent_hosts: force_custom_user_agent_hosts, force_custom_user_agent_hosts: force_custom_user_agent_hosts,
preserve_fragment_url_hosts: preserve_fragment_url_hosts) preserve_fragment_url_hosts: preserve_fragment_url_hosts
}
user_agent_override = SiteSetting.cache_onebox_user_agent if Oneboxer.cache_response_body?(url) && SiteSetting.cache_onebox_user_agent.present?
fd_options[:default_user_agent] = user_agent_override if user_agent_override
fd = FinalDestination.new(url, fd_options)
uri = fd.resolve uri = fd.resolve
if fd.status != :resolved if fd.status != :resolved
@ -389,20 +426,22 @@ module Oneboxer
return error_box return error_box
end end
return blank_onebox if uri.blank? || blocked_domains.map { |hostname| uri.hostname.match?(hostname) }.any? return blank_onebox if uri.blank? || blocked_domains.any? { |hostname| uri.hostname.match?(hostname) }
options = { onebox_options = {
max_width: 695, max_width: 695,
sanitize_config: Onebox::DiscourseOneboxSanitizeConfig::Config::DISCOURSE_ONEBOX, sanitize_config: Onebox::DiscourseOneboxSanitizeConfig::Config::DISCOURSE_ONEBOX,
allowed_iframe_origins: allowed_iframe_origins, allowed_iframe_origins: allowed_iframe_origins,
hostname: GlobalSetting.hostname, hostname: GlobalSetting.hostname,
facebook_app_access_token: SiteSetting.facebook_app_access_token, facebook_app_access_token: SiteSetting.facebook_app_access_token,
disable_media_download_controls: SiteSetting.disable_onebox_media_download_controls disable_media_download_controls: SiteSetting.disable_onebox_media_download_controls,
body_cacher: self
} }
options[:cookie] = fd.cookie if fd.cookie onebox_options[:cookie] = fd.cookie if fd.cookie
onebox_options[:user_agent] = user_agent_override if user_agent_override
r = Onebox.preview(uri.to_s, options) r = Onebox.preview(uri.to_s, onebox_options)
result = { onebox: r.to_s, preview: r&.placeholder_html.to_s } result = { onebox: r.to_s, preview: r&.placeholder_html.to_s }
# NOTE: Call r.errors after calling placeholder_html # NOTE: Call r.errors after calling placeholder_html

View File

@ -330,9 +330,51 @@ describe Oneboxer do
end end
describe '#force_get_hosts' do describe '#force_get_hosts' do
before do
SiteSetting.cache_onebox_response_body_domains = "example.net|example.com|example.org"
end
it "includes Amazon sites" do it "includes Amazon sites" do
expect(Oneboxer.force_get_hosts).to include('https://www.amazon.ca') expect(Oneboxer.force_get_hosts).to include('https://www.amazon.ca')
end end
it "includes cache_onebox_response_body_domains" do
expect(Oneboxer.force_get_hosts).to include('https://www.example.com')
end
end
describe 'cache_onebox_response_body' do
let(:html) do
<<~HTML
<html>
<body>
<p>cache me if you can</p>
</body>
<html>
HTML
end
let(:url) { "https://www.example.com/my/great/content" }
let(:url2) { "https://www.example2.com/my/great/content" }
before do
stub_request(:any, url).to_return(status: 200, body: html)
stub_request(:any, url2).to_return(status: 200, body: html)
SiteSetting.cache_onebox_response_body = true
SiteSetting.cache_onebox_response_body_domains = "example.net|example.com|example.org"
end
it "caches when domain matches" do
preview = Oneboxer.preview(url, invalidate_oneboxes: true)
expect(Oneboxer.cached_response_body_exists?(url)).to eq(true)
expect(Oneboxer.fetch_cached_response_body(url)).to eq(html)
end
it "ignores cache when domain not present" do
preview = Oneboxer.preview(url2, invalidate_oneboxes: true)
expect(Oneboxer.cached_response_body_exists?(url2)).to eq(false)
end
end end
end end