mirror of
https://github.com/discourse/discourse.git
synced 2025-05-22 07:53:49 +08:00
FEATURE: control which web crawlers can access using a whitelist or blacklist
This commit is contained in:
76
app/models/web_crawler_request.rb
Normal file
76
app/models/web_crawler_request.rb
Normal file
@ -0,0 +1,76 @@
|
||||
# Tracks how many requests each web crawler user agent makes per day.
#
# Counts are accumulated in Redis under per-day keys and later folded into the
# `count` column of the matching database row by `write_cache!`.
#
# Redis layout (see the key helpers at the bottom of the class):
#   crawl_ua_list:YYYYMMDD              - set of user agents seen that day
#   crawl_req:YYYYMMDD:<user agent>     - counter for one user agent that day
#
# NOTE(review): `perform_increment!`, `get_and_reset`, `request_id`,
# `last_flush=` and `autoflush_seconds=` are not defined in this file;
# presumably they come from CachedCounting or a companion file - confirm.
class WebCrawlerRequest < ActiveRecord::Base
  include CachedCounting

  # auto flush if older than this
  self.autoflush_seconds = 1.hour

  cattr_accessor :max_record_age, :max_records_per_day

  # only keep the top records based on request count
  # NOTE(review): the limit is only stored here; enforcement is presumably
  # done elsewhere - confirm.
  self.max_records_per_day = 200

  # delete records older than this
  # NOTE(review): the age is only stored here; the deletion itself is
  # presumably done elsewhere - confirm.
  self.max_record_age = 30.days

  # Records one request from +user_agent+.
  #
  # Adds the user agent to today's set of seen user agents (refreshing the
  # set's expiry) and then delegates the actual counting to
  # `perform_increment!`.
  #
  # @param user_agent [String] the crawler's user agent string
  # @param opts [Hash, nil] passed through unchanged to `perform_increment!`
  # @return whatever `perform_increment!` returns
  def self.increment!(user_agent, opts = nil)
    # Take a single timestamp so the set key and the counter key always refer
    # to the same day, even when the request straddles UTC midnight. Otherwise
    # the counter could land under a day whose set does not list the user
    # agent, and write_cache!/clear_cache! would never find it.
    now = Time.now.utc

    ua_list_key = user_agent_list_key(now)
    $redis.sadd(ua_list_key, user_agent)
    $redis.expire(ua_list_key, 259_200) # 3 days, in seconds

    perform_increment!(redis_key(user_agent, now), opts)
  end

  # Flushes the Redis counters for +date+ into the database.
  #
  # With no argument, flushes both today and yesterday (UTC) and returns nil.
  # For a given date, every user agent in that day's set has its counter read
  # and reset via `get_and_reset`; non-zero values are added to the `count`
  # column of the row identified by `req_id`.
  #
  # @param date [Date, Time, nil] day to flush; nil means today and yesterday
  # @return [nil] in the no-argument case, or when Redis reports READONLY
  # @raise [Redis::CommandError] any Redis command error other than READONLY
  def self.write_cache!(date = nil)
    if date.nil?
      # One timestamp for both calls so "today" and "yesterday" are always two
      # distinct, consecutive days.
      now = Time.now.utc
      write_cache!(now)
      write_cache!(now.yesterday)
      return
    end

    self.last_flush = Time.now.utc

    date = date.to_date

    $redis.smembers(user_agent_list_key(date)).each do |user_agent|
      val = get_and_reset(redis_key(user_agent, date))

      next if val == 0

      where(id: req_id(date, user_agent)).update_all(["count = count + ?", val])
    end
  rescue Redis::CommandError => e
    # A READONLY error is swallowed on purpose: the flush is skipped and nil
    # is returned. Every other Redis command error is re-raised.
    raise unless e.message =~ /READONLY/
    nil
  end

  # Deletes the Redis counters and the user agent set for +date+ without
  # writing anything to the database.
  #
  # With no argument, clears both today and yesterday (UTC) and returns nil.
  #
  # @param date [Date, Time, nil] day to clear; nil means today and yesterday
  def self.clear_cache!(date = nil)
    if date.nil?
      # One timestamp for both calls, for the same reason as in write_cache!.
      now = Time.now.utc
      clear_cache!(now)
      clear_cache!(now.yesterday)
      return
    end

    list_key = user_agent_list_key(date)

    $redis.smembers(list_key).each do |user_agent|
      $redis.del(redis_key(user_agent, date))
    end

    $redis.del(list_key)
  end

  # Key helpers.
  #
  # These were previously listed under a bare `protected`, which has no effect
  # on methods defined with `def self.`; they have always been publicly
  # callable and are intentionally left that way for backward compatibility.

  # Redis key of the set holding every user agent seen on the day of +time+.
  #
  # @param time [Time, Date] any object responding to `strftime`
  # @return [String] e.g. "crawl_ua_list:20180322"
  def self.user_agent_list_key(time = Time.now.utc)
    "crawl_ua_list:#{time.strftime('%Y%m%d')}"
  end

  # Redis key of the request counter for +user_agent+ on the day of +time+.
  #
  # @param user_agent [String] the crawler's user agent string
  # @param time [Time, Date] any object responding to `strftime`
  # @return [String] e.g. "crawl_req:20180322:<user agent>"
  def self.redis_key(user_agent, time = Time.now.utc)
    "crawl_req:#{time.strftime('%Y%m%d')}:#{user_agent}"
  end

  # Database row id for the given day and user agent, as resolved by
  # `request_id` (defined outside this file).
  def self.req_id(date, user_agent)
    request_id(date: date, user_agent: user_agent)
  end
end
|
Reference in New Issue
Block a user