FEATURE: Add experimental tracking of 'real browser' pageviews (#26647)

Our 'page_view_crawler' / 'page_view_anon' metrics are based purely on the User Agent sent by clients. This means that 'badly behaved' bots which are imitating real user agents are counted towards 'anon' page views.

This commit introduces a new method of tracking visitors. When an initial HTML request is made, we assume it is a 'non-browser' request (i.e. a bot). Then, once the JS application has booted, we notify the server to count it as a 'browser' request. This reliance on a JavaScript-capable browser matches up more closely to dedicated analytics systems like Google Analytics.

Existing data collection and graphs are unchanged. Data collected via the new technique is available in a new 'experimental' report.
This commit is contained in:
David Taylor
2024-04-25 11:00:01 +01:00
committed by GitHub
parent 52e8d57293
commit 2f2da72747
16 changed files with 412 additions and 7 deletions

View File

@ -74,9 +74,31 @@ class Middleware::RequestTracker
elsif data[:has_auth_cookie]
ApplicationRequest.increment!(:page_view_logged_in)
ApplicationRequest.increment!(:page_view_logged_in_mobile) if data[:is_mobile]
if data[:explicit_track_view]
# Must be a browser if it had this header from our ajax implementation
ApplicationRequest.increment!(:page_view_logged_in_browser)
ApplicationRequest.increment!(:page_view_logged_in_browser_mobile) if data[:is_mobile]
end
elsif !SiteSetting.login_required
ApplicationRequest.increment!(:page_view_anon)
ApplicationRequest.increment!(:page_view_anon_mobile) if data[:is_mobile]
if data[:explicit_track_view]
# Must be a browser if it had this header from our ajax implementation
ApplicationRequest.increment!(:page_view_anon_browser)
ApplicationRequest.increment!(:page_view_anon_browser_mobile) if data[:is_mobile]
end
end
end
# Message-bus requests may include this 'deferred track' header which we use to detect
# 'real browser' views.
if data[:deferred_track] && !data[:is_crawler]
if data[:has_auth_cookie]
ApplicationRequest.increment!(:page_view_logged_in_browser)
ApplicationRequest.increment!(:page_view_logged_in_browser_mobile) if data[:is_mobile]
elsif !SiteSetting.login_required
ApplicationRequest.increment!(:page_view_anon_browser)
ApplicationRequest.increment!(:page_view_anon_browser_mobile) if data[:is_mobile]
end
end
@ -103,12 +125,20 @@ class Middleware::RequestTracker
request ||= Rack::Request.new(env)
helper = Middleware::AnonymousCache::Helper.new(env, request)
# Value of the discourse-track-view request header
env_track_view = env["HTTP_DISCOURSE_TRACK_VIEW"]
track_view = status == 200
track_view &&= env_track_view != "0" && env_track_view != "false"
track_view &&=
env_track_view || (request.get? && !request.xhr? && headers["Content-Type"] =~ %r{text/html})
track_view = !!track_view
# Was the discourse-track-view request header set to true? Likely
# set by our ajax library to indicate a page view.
explicit_track_view = status == 200 && %w[1 true].include?(env_track_view)
# An HTML response to a GET request is tracked implicitly
implicit_track_view =
status == 200 && !%w[0 false].include?(env_track_view) && request.get? && !request.xhr? &&
headers["Content-Type"] =~ %r{text/html}
track_view = !!(explicit_track_view || implicit_track_view)
has_auth_cookie = Auth::DefaultCurrentUserProvider.find_v0_auth_cookie(request).present?
has_auth_cookie ||= Auth::DefaultCurrentUserProvider.find_v1_auth_cookie(env).present?
@ -118,6 +148,10 @@ class Middleware::RequestTracker
is_message_bus = request.path.start_with?("#{Discourse.base_path}/message-bus/")
is_topic_timings = request.path.start_with?("#{Discourse.base_path}/topics/timings")
# This header is sent on a follow-up request after a real browser loads up a page
# see `scripts/pageview.js` and `instance-initializers/page-tracking.js`
has_deferred_track_header = %w[1 true].include?(env["HTTP_DISCOURSE_DEFERRED_TRACK_VIEW"])
h = {
status: status,
is_crawler: helper.is_crawler?,
@ -129,6 +163,8 @@ class Middleware::RequestTracker
track_view: track_view,
timing: timing,
queue_seconds: env["REQUEST_QUEUE_SECONDS"],
explicit_track_view: explicit_track_view,
deferred_track: has_deferred_track_header,
}
if h[:is_background]
@ -166,7 +202,8 @@ class Middleware::RequestTracker
data =
begin
self.class.get_data(env, result, info, request)
rescue StandardError
rescue StandardError => e
Rails.logger.warn("RequestTracker.get_data failed: #{e}")
nil
end