FIX: exceptions on crawler requests for non-UTF-8 user agents with invalid bytes

This commit is contained in:
Arkshine
2024-05-24 03:49:17 +02:00
committed by Régis Hanol
parent b757275c1e
commit 1fffb236b2
5 changed files with 86 additions and 13 deletions

View File

@ -112,6 +112,18 @@ RSpec.describe Middleware::AnonymousCache do
expect(key1).not_to eq(key2)
end
# A user agent carrying bytes that are invalid for its declared encoding must
# not raise while the cache key is built, and must yield the same key as the
# copy of that agent converted to UTF-8 with replacement characters.
it "handles user agents with invalid bytes" do
  # "\xc3" is outside the 7-bit range, so the ASCII-tagged string is malformed.
  rogue_agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII")

  expect {
    rogue_key = new_helper("HTTP_USER_AGENT" => rogue_agent).cache_key

    # Replace (rather than raise on) the invalid/undefined bytes.
    scrubbed_agent = rogue_agent.encode("utf-8", invalid: :replace, undef: :replace)
    scrubbed_key = new_helper("HTTP_USER_AGENT" => scrubbed_agent).cache_key

    expect(rogue_key).to eq(scrubbed_key)
  }.not_to raise_error
end
context "when cached" do
let!(:helper) { new_helper("ANON_CACHE_DURATION" => 10) }
@ -351,6 +363,15 @@ RSpec.describe Middleware::AnonymousCache do
expect(@status).to eq(403)
expect {
get "/",
headers: {
"HTTP_USER_AGENT" => (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII"),
}
expect(@status).to eq(403)
}.not_to raise_error
get "/",
headers: {
"HTTP_USER_AGENT" => "Twitterbot/2.1 (+http://www.notgoogle.com/bot.html)",

View File

@ -38,6 +38,46 @@ RSpec.describe Middleware::RequestTracker do
expect(WebCrawlerRequest.where(user_agent: agent.encode("utf-8")).count).to eq(1)
end
# Runs a request with a malformed user agent through the tracker middleware,
# flushes the cached counters, and expects exactly one WebCrawlerRequest row
# stored under the replacement-encoded user agent, with no exception raised.
it "can handle rogue user agents with invalid bytes sequences" do
  # Tagged as ASCII, a plain encode("utf-8") of this string raises
  # InvalidByteSequenceError.
  rogue_agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII")

  expect {
    # Minimal downstream app: always answers 200 with an empty HTML body.
    app = ->(_env) { ["200", { "Content-Type" => "text/html" }, [""]] }
    tracker = Middleware::RequestTracker.new(app)
    tracker.call(env("HTTP_USER_AGENT" => rogue_agent))

    CachedCounting.flush

    # The stored value is expected to match the agent with bad bytes replaced.
    scrubbed_agent = rogue_agent.encode("utf-8", invalid: :replace, undef: :replace)
    expect(WebCrawlerRequest.where(user_agent: scrubbed_agent).count).to eq(1)
  }.not_to raise_error
end
# Same scenario as a malformed user agent, but with the bytes tagged as binary:
# the tracker middleware must not raise, and exactly one WebCrawlerRequest row
# is expected under the replacement-encoded user agent after the flush.
it "can handle rogue user agents with undefined characters in the destination encoding" do
  # Tagged as ASCII-8BIT, a plain encode("utf-8") of this string raises
  # UndefinedConversionError.
  binary_agent = (+"Evil Googlebot String \xc3\x28").force_encoding("ASCII-8BIT")

  expect {
    # Minimal downstream app: always answers 200 with an empty HTML body.
    app = ->(_env) { ["200", { "Content-Type" => "text/html" }, [""]] }
    tracker = Middleware::RequestTracker.new(app)
    tracker.call(env("HTTP_USER_AGENT" => binary_agent))

    CachedCounting.flush

    # The stored value is expected to match the agent with bad bytes replaced.
    scrubbed_agent = binary_agent.encode("utf-8", invalid: :replace, undef: :replace)
    expect(WebCrawlerRequest.where(user_agent: scrubbed_agent).count).to eq(1)
  }.not_to raise_error
end
end
describe "log_request" do