FEATURE: control which web crawlers can access using a whitelist or blacklist

This commit is contained in:
Neil Lalonde
2018-03-15 17:10:45 -04:00
parent cbbeedf53b
commit ced7e9a691
22 changed files with 722 additions and 97 deletions

View File

@ -28,4 +28,25 @@ module CrawlerDetection
end
end
# Given a user_agent that returns true from crawler?, should its request be allowed?
def self.allow_crawler?(user_agent)
return true if SiteSetting.whitelisted_crawler_user_agents.blank? &&
SiteSetting.blacklisted_crawler_user_agents.blank?
@whitelisted_matchers ||= {}
@blacklisted_matchers ||= {}
if SiteSetting.whitelisted_crawler_user_agents.present?
whitelisted = @whitelisted_matchers[SiteSetting.whitelisted_crawler_user_agents] ||= to_matcher(SiteSetting.whitelisted_crawler_user_agents)
!user_agent.nil? && user_agent.match?(whitelisted)
else
blacklisted = @blacklisted_matchers[SiteSetting.blacklisted_crawler_user_agents] ||= to_matcher(SiteSetting.blacklisted_crawler_user_agents)
user_agent.nil? || !user_agent.match?(blacklisted)
end
end
def self.is_blocked_crawler?(user_agent)
crawler?(user_agent) && !allow_crawler?(user_agent)
end
end