Mirror of https://github.com/discourse/discourse.git (synced 2025-06-04 23:36:11 +08:00)
FIX: crawler requests not tracked for non UTF-8 user agents
Requests with a non-UTF-8 user_agent were bypassing logging because PG only accepts UTF-8 strings. This adds a conversion step so we are always dealing with UTF-8.
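For context, a minimal standalone Ruby sketch of the failure mode (it reuses the rogue byte pair from the spec below; nothing in it is Discourse API):

    # Bytes that are perfectly legal Windows-1252 but invalid UTF-8.
    agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")

    agent.encoding        # => #<Encoding:Windows-1252>
    agent.valid_encoding? # => true (0xC3 = "Ã", 0x28 = "(" in Windows-1252)

    # The same bytes labeled as UTF-8 are invalid, which is what PG rejects.
    agent.dup.force_encoding("UTF-8").valid_encoding? # => false

    # The fix applied below: transcode to UTF-8, then scrub anything left over.
    utf8 = agent.encode("utf-8")
    utf8.scrub!
    utf8 # => "Evil Googlebot String Ã("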
@@ -117,7 +117,12 @@ class Middleware::RequestTracker
     }

     if h[:is_crawler]
-      h[:user_agent] = env['HTTP_USER_AGENT']
+      user_agent = env['HTTP_USER_AGENT']
+      if user_agent.encoding != Encoding::UTF_8
+        user_agent = user_agent.encode("utf-8")
+        user_agent.scrub!
+      end
+      h[:user_agent] = user_agent
     end

     if cache = headers["X-Discourse-Cached"]
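One caveat worth flagging: String#encode with no fallback options raises Encoding::UndefinedConversionError when a byte has no mapping in the target encoding (a handful of Windows-1252 bytes, e.g. 0x81, are undefined). A more defensive variant would replace unmappable bytes during the conversion itself; the helper below is a hypothetical sketch, not what this commit ships:

    # Hypothetical helper (ensure_utf8 is not part of Discourse), shown only
    # to illustrate the replace-on-error flavor of the same normalization.
    def ensure_utf8(str)
      return str if str.nil? || str.encoding == Encoding::UTF_8
      str.encode(Encoding::UTF_8, invalid: :replace, undef: :replace)
    end

    ensure_utf8((+"abc\x81").force_encoding("Windows-1252")) # => "abc\u{FFFD}"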
@@ -15,6 +15,26 @@ describe Middleware::RequestTracker do
     }.merge(opts)
   end

+  context "full request" do
+    before do
+      @orig = WebCrawlerRequest.autoflush
+      WebCrawlerRequest.autoflush = 1
+    end
+    after do
+      WebCrawlerRequest.autoflush = @orig
+    end
+
+    it "can handle rogue user agents" do
+      agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")
+
+      middleware = Middleware::RequestTracker.new(->(env) { ["200", { "Content-Type" => "text/html" }, [""]] })
+      middleware.call(env("HTTP_USER_AGENT" => agent))
+
+      expect(WebCrawlerRequest.where(user_agent: agent.encode('utf-8')).count).to eq(1)
+    end
+
+  end
+
   context "log_request" do
     before do
       freeze_time Time.now
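For reference, the value the spec queries for, reproduced standalone (same byte pair as above):

    agent = (+"Evil Googlebot String \xc3\x28").force_encoding("Windows-1252")

    # 0xC3 maps to "Ã" and 0x28 to "(" in Windows-1252, so the UTF-8 string
    # the middleware logs (and the spec looks up) is:
    agent.encode('utf-8')                 # => "Evil Googlebot String Ã("
    agent.encode('utf-8').valid_encoding? # => true, safe for the PG column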