mirror of
https://github.com/discourse/discourse.git
synced 2025-05-23 21:31:04 +08:00
FIX: Attempt to onebox even if response body exceeds max_download_kb
(#26929)
In 95a82d608d6377faf68a0e2c5d9640b043557852, we lowered the default for `Onebox.options.max_download_kb` from 10mb to 2mb for security hardening purposes. However, this resulted in multiple bug reports where seemingly nomral URLs stopped being oneboxed. It turns out that lowering `Onebox.options.max_download_kb` resulted in `Onebox::Helpers::DownloadTooLarge` being raised more often for more URLs in `Onebox::Helpers.fetch_response` which `Onebox::Helpers.fetch_html_doc` relies on. When `Onebox::Helpers::DownloadTooLarge` is raised in `Onebox::Helpers.fetch_response`, we throw away whatever response body which we have already downloaded at that point. This is not ideal because Nokogiri can parse incomplete HTML documents and there is a really high chance that the incomplete HTML document still contains the information which we need for oneboxing. Therefore, this commit updates `Onebox::Helpers.fetch_html_doc` to not throw away the response body when the size of the response body exceeds `Onebox.options.max_download_size`. Instead, we just take whatever response which we have and get Nokogiri to parse it.
This commit is contained in:

committed by
GitHub

parent
13334a3da0
commit
c8da2a33e8
@ -13,15 +13,20 @@ module Onebox
|
|||||||
html.gsub(/<[^>]+>/, " ").gsub(/\n/, "")
|
html.gsub(/<[^>]+>/, " ").gsub(/\n/, "")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Fetches the HTML response body for a URL.
|
||||||
|
#
|
||||||
|
# Note that the size of the response body is capped at `Onebox.options.max_download_kb`. When the limit has been reached,
|
||||||
|
# this method will return the response body that has been downloaded up to the limit.
|
||||||
def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
|
def self.fetch_html_doc(url, headers = nil, body_cacher = nil)
|
||||||
response =
|
response =
|
||||||
(
|
(
|
||||||
begin
|
begin
|
||||||
fetch_response(url, headers: headers, body_cacher: body_cacher)
|
fetch_response(url, headers:, body_cacher:, raise_error_when_response_too_large: false)
|
||||||
rescue StandardError
|
rescue StandardError
|
||||||
nil
|
nil
|
||||||
end
|
end
|
||||||
)
|
)
|
||||||
|
|
||||||
doc = Nokogiri.HTML(response)
|
doc = Nokogiri.HTML(response)
|
||||||
uri = Addressable::URI.parse(url)
|
uri = Addressable::URI.parse(url)
|
||||||
|
|
||||||
@ -45,7 +50,12 @@ module Onebox
|
|||||||
response =
|
response =
|
||||||
(
|
(
|
||||||
begin
|
begin
|
||||||
fetch_response(uri.to_s, headers: headers, body_cacher: body_cacher)
|
fetch_response(
|
||||||
|
uri.to_s,
|
||||||
|
headers:,
|
||||||
|
body_cacher:,
|
||||||
|
raise_error_when_response_too_large: false,
|
||||||
|
)
|
||||||
rescue StandardError
|
rescue StandardError
|
||||||
nil
|
nil
|
||||||
end
|
end
|
||||||
@ -63,7 +73,8 @@ module Onebox
|
|||||||
redirect_limit: 5,
|
redirect_limit: 5,
|
||||||
domain: nil,
|
domain: nil,
|
||||||
headers: nil,
|
headers: nil,
|
||||||
body_cacher: nil
|
body_cacher: nil,
|
||||||
|
raise_error_when_response_too_large: true
|
||||||
)
|
)
|
||||||
redirect_limit = Onebox.options.redirect_limit if redirect_limit >
|
redirect_limit = Onebox.options.redirect_limit if redirect_limit >
|
||||||
Onebox.options.redirect_limit
|
Onebox.options.redirect_limit
|
||||||
@ -125,7 +136,11 @@ module Onebox
|
|||||||
|
|
||||||
response.read_body do |chunk|
|
response.read_body do |chunk|
|
||||||
result.write(chunk)
|
result.write(chunk)
|
||||||
raise DownloadTooLarge.new if result.size > size_bytes
|
|
||||||
|
if result.size > size_bytes
|
||||||
|
raise_error_when_response_too_large ? raise(DownloadTooLarge.new) : break
|
||||||
|
end
|
||||||
|
|
||||||
raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout
|
raise Timeout::Error.new if (Time.now - start_time) > Onebox.options.timeout
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -16,6 +16,7 @@ RSpec.describe Onebox::Helpers do
|
|||||||
around do |example|
|
around do |example|
|
||||||
previous_options = Onebox.options.to_h
|
previous_options = Onebox.options.to_h
|
||||||
Onebox.options = { max_download_kb: 1 }
|
Onebox.options = { max_download_kb: 1 }
|
||||||
|
|
||||||
stub_request(:get, "http://example.com/large-file").to_return(
|
stub_request(:get, "http://example.com/large-file").to_return(
|
||||||
status: 200,
|
status: 200,
|
||||||
body: onebox_response("slides"),
|
body: onebox_response("slides"),
|
||||||
@ -32,6 +33,15 @@ RSpec.describe Onebox::Helpers do
|
|||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "returns the body of the response when size of response body exceeds the limit and `raise_error_when_response_too_large` has been set to `false`" do
|
||||||
|
expect(
|
||||||
|
described_class.fetch_response(
|
||||||
|
"http://example.com/large-file",
|
||||||
|
raise_error_when_response_too_large: false,
|
||||||
|
),
|
||||||
|
).to eq(onebox_response("slides"))
|
||||||
|
end
|
||||||
|
|
||||||
it "raises an exception when private url requested" do
|
it "raises an exception when private url requested" do
|
||||||
FinalDestination::TestHelper.stub_to_fail do
|
FinalDestination::TestHelper.stub_to_fail do
|
||||||
expect { described_class.fetch_response("http://example.com/large-file") }.to raise_error(
|
expect { described_class.fetch_response("http://example.com/large-file") }.to raise_error(
|
||||||
@ -49,6 +59,22 @@ RSpec.describe Onebox::Helpers do
|
|||||||
expect(described_class.fetch_html_doc(uri).to_s).to match("success")
|
expect(described_class.fetch_html_doc(uri).to_s).to match("success")
|
||||||
end
|
end
|
||||||
|
|
||||||
|
it "does not raise an error when response body exceeds Onebox's `max_download_kb` limit" do
|
||||||
|
previous_options = Onebox.options.to_h
|
||||||
|
Onebox.options = previous_options.merge(max_download_kb: 1)
|
||||||
|
|
||||||
|
stub_request(:get, "http://example.com/large-file").to_return(
|
||||||
|
status: 200,
|
||||||
|
body: onebox_response("slides"),
|
||||||
|
)
|
||||||
|
|
||||||
|
expect(described_class.fetch_html_doc("http://example.com/large-file").to_s).to include(
|
||||||
|
"ECMAScript 2015 by David Leonard",
|
||||||
|
)
|
||||||
|
ensure
|
||||||
|
Onebox.options = previous_options
|
||||||
|
end
|
||||||
|
|
||||||
context "with canonical link" do
|
context "with canonical link" do
|
||||||
it "follows canonical link" do
|
it "follows canonical link" do
|
||||||
uri = "https://www.example.com"
|
uri = "https://www.example.com"
|
||||||
|
Reference in New Issue
Block a user