mirror of
https://github.com/discourse/discourse.git
synced 2025-06-24 04:31:33 +08:00

This change standardises the `User-Agent` header that Discourse will send when talking to other sites. `Discourse.user_agent` is now the authority on what the user agent value should be. For Onebox requests, this changes the user agent from their existing value to match the new value (unless overridden). For all other requests, `Net::HTTPHeader` is monkey-patched to add a default `User-Agent` header when one hasn't been provided.
308 lines
9.6 KiB
Ruby
308 lines
9.6 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
require "addressable"
|
|
|
|
module Onebox
  # Stateless utility methods used when building oneboxes: fetching remote
  # documents, normalising/encoding URLs, formatting values and producing
  # placeholder markup.
  module Helpers
    # Raised by +fetch_response+ when the downloaded body exceeds
    # +Onebox.options.max_download_kb+ and the caller asked for an error.
    class DownloadTooLarge < StandardError
    end

    # Hostname patterns for which +fetch_html_doc+ never follows a page's
    # canonical link.
    IGNORE_CANONICAL_DOMAINS = %w[www.instagram.com medium.com youtube.com].freeze

    # Replaces every HTML tag in +html+ with a single space and removes all
    # newline characters.
    #
    # @param html [String]
    # @return [String]
    def self.clean(html)
      html.gsub(/<[^>]+>/, " ").delete("\n")
    end

    # Fetches the HTML response body for a URL.
    #
    # Note that the size of the response body is capped at `Onebox.options.max_download_kb`. When the limit has been reached,
    # this method will return the response body that has been downloaded up to the limit.
    #
    # When the document declares a canonical link pointing at a different
    # host/path, the canonical URL is resolved and fetched instead, unless the
    # page opts out via `<meta property="og:ignore_canonical" content="true">`
    # or its hostname matches one of IGNORE_CANONICAL_DOMAINS.
    #
    # @param url [String] the URL to fetch
    # @param headers [Hash, nil] extra request headers passed to +fetch_response+
    # @param body_cacher [Object, nil] optional cache passed to +fetch_response+
    # @return [Object] the document returned by +Nokogiri.HTML+ (built from
    #   +nil+ when the initial fetch failed)
    def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
      # Best-effort download: any StandardError is swallowed and reported as nil
      # so that a document is always returned.
      fetch_body =
        lambda do |target|
          begin
            fetch_response(
              target,
              headers: headers,
              body_cacher: body_cacher,
              raise_error_when_response_too_large: false,
            )
          rescue StandardError
            nil
          end
        end

      doc = Nokogiri.HTML(fetch_body.call(url))
      uri = Addressable::URI.parse(url).normalize!

      ignore_canonical_tag = doc.at('meta[property="og:ignore_canonical"]')
      # NOTE(review): String#match? treats each entry as a regular expression,
      # so this is an unanchored match (subdomains match too) - confirm intended.
      should_ignore_canonical =
        IGNORE_CANONICAL_DOMAINS.any? { |pattern| uri.hostname.match?(pattern) }

      ignored_by_tag = ignore_canonical_tag && ignore_canonical_tag["content"].to_s == "true"
      return doc if ignored_by_tag || should_ignore_canonical

      # prefer canonical link
      canonical_link = doc.at('//link[@rel="canonical"]/@href')
      canonical_uri = Addressable::URI.parse(canonical_link)&.normalize!
      return doc unless canonical_link && canonical_uri
      return doc if "#{canonical_uri.host}#{canonical_uri.path}" == "#{uri.host}#{uri.path}"

      resolved_uri =
        FinalDestination.new(
          canonical_uri,
          Oneboxer.get_final_destination_options(canonical_uri),
        ).resolve
      return doc unless resolved_uri.present?

      # Keep the original document when the canonical page cannot be fetched.
      canonical_body = fetch_body.call(resolved_uri.to_s)
      canonical_body ? Nokogiri.HTML(canonical_body) : doc
    end

    # Performs a GET request and returns the response body, following
    # 301/302/303/307/308 redirects.
    #
    # @param location [String] absolute URL, or a host-less URL that is joined
    #   onto +domain+
    # @param redirect_limit [Integer] remaining redirects; capped at
    #   +Onebox.options.redirect_limit+
    # @param domain [String, nil] base ("scheme://host") used when +location+
    #   has no host
    # @param headers [Hash, nil] request headers; the hash is copied and never
    #   mutated
    # @param body_cacher [Object, nil] used only when it responds to
    #   +fetch_cached_response_body+; a present cached body is returned without
    #   issuing a request
    # @param raise_error_when_response_too_large [Boolean] when true, exceeding
    #   +Onebox.options.max_download_kb+ raises; when false the body downloaded
    #   so far is returned
    # @param allow_cross_domain_cookies [Boolean] when true, cookies received
    #   are merged into the headers forwarded to the redirect target
    # @return [String] the (possibly truncated or cached) response body
    # @raise [Net::HTTPError] when the redirect budget is exhausted
    # @raise [DownloadTooLarge] see +raise_error_when_response_too_large+
    # @raise [Timeout::Error] when reading the body takes longer than
    #   +Onebox.options.timeout+
    def self.fetch_response(
      location,
      redirect_limit: 5,
      domain: nil,
      headers: nil,
      body_cacher: nil,
      raise_error_when_response_too_large: true,
      allow_cross_domain_cookies: false
    )
      redirect_limit = [redirect_limit, Onebox.options.redirect_limit].min

      raise Net::HTTPError.new("HTTP redirect too deep", location) if redirect_limit == 0

      uri = Addressable::URI.parse(location)
      uri = Addressable::URI.join(domain, uri) if !uri.host

      use_body_cacher = body_cacher && body_cacher.respond_to?("fetch_cached_response_body")
      if use_body_cacher
        response_body = body_cacher.fetch_cached_response_body(uri.to_s)

        return response_body if response_body.present?
      end

      # Work on a copy so defaults and cookies never leak into the caller's hash.
      headers = headers ? headers.dup : {}

      result = StringIO.new
      FinalDestination::HTTP.start(
        uri.host,
        uri.port,
        open_timeout: Onebox.options.connect_timeout,
        use_ssl: uri.normalized_scheme == "https",
      ) do |http|
        http.read_timeout = Onebox.options.timeout
        # NOTE(review): this disables certificate verification - confirm it is still required.
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE # Work around path building bugs

        headers["User-Agent"] ||= user_agent if user_agent
        headers["Accept-Language"] ||= Oneboxer.accept_language

        request = Net::HTTP::Get.new(uri.request_uri, headers)
        start_time = Time.now

        size_bytes = Onebox.options.max_download_kb * 1024
        http.request(request) do |response|
          redir_header = nil
          cookie = response.get_fields("set-cookie")
          if cookie
            headers["Cookie"] = cookie.join("; ") if allow_cross_domain_cookies
            # HACK: If this breaks again in the future, use HTTP::CookieJar from gem 'http-cookie'
            # See test: it "does not send cookies to the wrong domain"
            redir_header = { "Cookie" => cookie.join("; ") }
          end

          code = response.code.to_i
          unless code == 200
            response.error! if [301, 302, 303, 307, 308].exclude?(code)

            # The size policy is forwarded so that a redirected download is
            # truncated (rather than raising) when the caller asked for that.
            # NOTE(review): body_cacher is not forwarded, so redirected
            # responses are never cached - confirm that this is intended.
            return(
              fetch_response(
                response["location"],
                redirect_limit: redirect_limit - 1,
                domain: "#{uri.scheme}://#{uri.host}",
                headers: allow_cross_domain_cookies ? headers : redir_header,
                raise_error_when_response_too_large: raise_error_when_response_too_large,
                allow_cross_domain_cookies: allow_cross_domain_cookies,
              )
            )
          end

          response.read_body do |chunk|
            result.write(chunk)

            if result.size > size_bytes
              raise DownloadTooLarge if raise_error_when_response_too_large
              # Stop reading and keep what has been downloaded so far.
              break
            end

            raise Timeout::Error if (Time.now - start_time) > Onebox.options.timeout
          end

          if use_body_cacher && body_cacher.cache_response_body?(uri)
            body_cacher.cache_response_body(uri.to_s, result.string)
          end

          return result.string
        end
      end
    end

    # Issues a HEAD request and returns the advertised content length.
    #
    # @param location [String] the URL to probe
    # @return [Integer, nil] the Content-Length when the response status is 200
    #   and the header is present, otherwise nil
    def self.fetch_content_length(location)
      uri = URI(location)

      # Build the request target explicitly: an empty path becomes "/" and the
      # "?" separator is only added when the URL actually has a query part.
      request_path = uri.path.to_s
      request_path = "/#{request_path}" unless request_path.start_with?("/")
      request_path = "#{request_path}?#{uri.query}" if uri.query

      FinalDestination::HTTP.start(
        uri.host,
        uri.port,
        open_timeout: Onebox.options.connect_timeout,
        use_ssl: uri.is_a?(URI::HTTPS),
      ) do |http|
        http.read_timeout = Onebox.options.timeout
        if uri.is_a?(URI::HTTPS)
          http.use_ssl = true
          http.verify_mode = OpenSSL::SSL::VERIFY_NONE
        end

        http.request_head(request_path) do |response|
          return response.code.to_i == 200 ? response.content_length.presence : nil
        end
      end
    end

    # Formats a byte count using 1024-based units.
    #
    # Values below 2 KB are printed verbatim in bytes; larger values are printed
    # with two decimals in the largest unit for which the value is at least 2
    # (capped at EB), e.g. 2048 => "2.00 KB".
    #
    # @param size [Numeric] number of bytes
    # @return [String]
    def self.pretty_filesize(size)
      units = %w[B KB MB GB TB PB EB]
      scale = 1024

      return "#{size} #{units.first}" if size < 2 * scale

      size = size.to_f
      exponent =
        (1...units.size).find { |power| size < 2 * (scale**(power + 1)) } || units.size - 1

      "#{"%.2f" % (size / (scale**exponent))} #{units[exponent]}"
    end

    # Returns the markup for a transparent overlay of the given size that
    # disables its own pointer events when clicked.
    #
    # @param width [Integer] width in pixels
    # @param height [Integer] height in pixels
    # @return [String]
    def self.click_to_scroll_div(width = 690, height = 400)
      "<div style=\"background:transparent;position:relative;width:#{width}px;height:#{height}px;top:#{height}px;margin-top:-#{height}px;\" onClick=\"style.pointerEvents='none'\"></div>"
    end

    # Shortens +string+ to at most +length+ characters plus a trailing "...",
    # cutting at the last space found at or before +length+ when there is one.
    #
    # @param string [String, nil]
    # @param length [Integer]
    # @return [String, nil] +string+ itself when it is nil or short enough
    def self.truncate(string, length = 50)
      return string if string.nil? || string.size <= length

      cut_at = string.rindex(" ", length) || length
      "#{string[0...cut_at]}..."
    end

    # Returns the sanitized value stored under +attr+ in +meta+.
    #
    # @param meta [#[], nil]
    # @param attr [Object] key to look up
    # @return [String, nil] nil when +meta+ is nil or the value is blank
    def self.get(meta, attr)
      sanitize(meta[attr]) if meta && meta[attr].present?
    end

    # Runs +value+ through +Sanitize.fragment+ and strips surrounding
    # whitespace.
    #
    # @param value [String, nil]
    # @param length [Integer] unused; kept so existing callers keep working
    # @return [String, nil] nil when +value+ is blank
    def self.sanitize(value, length = 50)
      return nil if value.blank?

      Sanitize.fragment(value).strip
    end

    # Makes a URL safe to embed in generated markup: spaces are
    # percent-encoded, quotes become HTML entities and every character outside
    # the allowed set is removed.
    #
    # @param url [String, nil]
    # @return [String] the cleaned URL, or "" when +url+ is nil or has no host
    def self.normalize_url_for_output(url)
      return "" unless url

      url = url.dup
      # expect properly encoded url, remove any unsafe chars
      url.gsub!(" ", "%20")
      url.gsub!("'", "&apos;")
      url.gsub!('"', "&quot;")
      url.gsub!(/[^\w\-`.~:\/?#\[\]@!$&'\(\)*+,;=%\p{M}’]/, "")

      parsed = Addressable::URI.parse(url)
      return "" unless parsed.host

      url
    end

    # Resolves +src+ against the page +url+.
    #
    # @param src [String] image location, possibly relative
    # @param url [String] URL of the page that references the image
    # @return [String] the absolute URL, or +src+ unchanged when either value
    #   cannot be parsed or merged
    def self.get_absolute_image_url(src, url)
      URI.parse(url).merge(src).to_s
    rescue ArgumentError, URI::BadURIError, URI::InvalidURIError
      src
    end

    # User-Agent sent with onebox requests.
    #
    # A custom agent from +SiteSetting.onebox_user_agent+, or failing that from
    # +Onebox.options.user_agent+, is suffixed with the Discourse version;
    # otherwise +Discourse.user_agent+ is returned.
    #
    # @return [String]
    def self.user_agent
      custom_agent = SiteSetting.onebox_user_agent.presence || Onebox.options.user_agent.presence
      return "#{custom_agent} v#{Discourse::VERSION::STRING}" if custom_agent

      Discourse.user_agent
    end

    # Percent-encodes a URI string per RFC3986 - https://tools.ietf.org/html/rfc3986
    #
    # Each component is encoded separately; "%" is part of every allowed
    # character set below, so existing percent signs are left untouched.
    #
    # @param url [String, nil]
    # @return [String] the encoded URI, or "" when +url+ is nil
    def self.uri_encode(url)
      return "" unless url

      uri = Addressable::URI.parse(url)

      encoded_uri =
        Addressable::URI.new(
          scheme:
            Addressable::URI.encode_component(
              uri.scheme,
              Addressable::URI::CharacterClasses::SCHEME,
            ),
          authority:
            Addressable::URI.encode_component(
              uri.authority,
              Addressable::URI::CharacterClasses::AUTHORITY,
            ),
          path:
            Addressable::URI.encode_component(
              uri.path,
              Addressable::URI::CharacterClasses::PATH + "\\%",
            ),
          query:
            Addressable::URI.encode_component(
              uri.query,
              "a-zA-Z0-9\\-\\.\\_\\~\\$\\&\\*\\,\\=\\:\\@\\?\\%",
            ),
          fragment:
            Addressable::URI.encode_component(
              uri.fragment,
              "a-zA-Z0-9\\-\\.\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:\\/\\?\\%",
            ),
        )

      encoded_uri.to_s
    end

    # Decodes percent-encoded sequences in +url+ via +Addressable::URI.unencode+.
    def self.uri_unencode(url)
      Addressable::URI.unencode(url)
    end

    # Placeholder markup for an image.
    #
    # @return [String]
    def self.image_placeholder_html
      "<div class='onebox-placeholder-container'><span class='placeholder-icon image'></span></div>"
    end

    # Placeholder markup for a video.
    #
    # @return [String]
    def self.video_placeholder_html
      "<div class='onebox-placeholder-container'><span class='placeholder-icon video'></span></div>"
    end

    # Placeholder markup for an audio clip.
    #
    # @return [String]
    def self.audio_placeholder_html
      "<div class='onebox-placeholder-container'><span class='placeholder-icon audio'></span></div>"
    end

    # Placeholder markup for a map.
    #
    # @return [String]
    def self.map_placeholder_html
      "<div class='onebox-placeholder-container'><span class='placeholder-icon map'></span></div>"
    end

    # Placeholder markup for any other kind of content.
    #
    # @return [String]
    def self.generic_placeholder_html
      "<div class='onebox-placeholder-container'><span class='placeholder-icon generic'></span></div>"
    end
  end
end
|