FEATURE: control which web crawlers can access using a whitelist or blacklist

2025-05-22 16:34:31 +08:00 · 2018-03-15 17:10:45 -04:00
parent cbbeedf53b
commit ced7e9a691
22 changed files with 722 additions and 97 deletions
--- a/spec/models/web_crawler_request_spec.rb
+++ b/spec/models/web_crawler_request_spec.rb
@ -0,0 +1,114 @@
+require 'rails_helper'
+
+describe WebCrawlerRequest do
+  before do
+    WebCrawlerRequest.last_flush = Time.now.utc
+    WebCrawlerRequest.clear_cache!
+  end
+
+  after do
+    WebCrawlerRequest.clear_cache!
+  end
+
+  def inc(user_agent, opts = nil)
+    WebCrawlerRequest.increment!(user_agent, opts)
+  end
+
+  def disable_date_flush!
+    freeze_time(Time.now)
+    WebCrawlerRequest.last_flush = Time.now.utc
+  end
+
+  def web_crawler_request(user_agent)
+    WebCrawlerRequest.where(user_agent: user_agent).first
+  end
+
+  it 'works even if redis is in readonly' do
+    disable_date_flush!
+
+    inc('Googlebot')
+    inc('Googlebot')
+
+    $redis.without_namespace.stubs(:incr).raises(Redis::CommandError.new("READONLY"))
+    $redis.without_namespace.stubs(:eval).raises(Redis::CommandError.new("READONLY"))
+
+    inc('Googlebot', autoflush: 3)
+    WebCrawlerRequest.write_cache!
+
+    $redis.without_namespace.unstub(:incr)
+    $redis.without_namespace.unstub(:eval)
+
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+  end
+
+  it 'logs nothing for an unflushed increment' do
+    WebCrawlerRequest.increment!('Googlebot')
+    expect(WebCrawlerRequest.count).to eq(0)
+  end
+
+  it 'can automatically flush' do
+    disable_date_flush!
+
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot')).to_not be_present
+    expect(WebCrawlerRequest.count).to eq(0)
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot')).to_not be_present
+    inc('Googlebot', autoflush: 3)
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+    expect(WebCrawlerRequest.count).to eq(1)
+
+    3.times { inc('Googlebot', autoflush: 3) }
+    expect(web_crawler_request('Googlebot').count).to eq(6)
+    expect(WebCrawlerRequest.count).to eq(1)
+  end
+
+  it 'can flush based on time' do
+    t1 = Time.now.utc.at_midnight
+    freeze_time(t1)
+    WebCrawlerRequest.write_cache!
+    inc('Googlebot')
+    expect(WebCrawlerRequest.count).to eq(0)
+
+    freeze_time(t1 + WebCrawlerRequest.autoflush_seconds + 1)
+    inc('Googlebot')
+
+    expect(WebCrawlerRequest.count).to eq(1)
+  end
+
+  it 'flushes yesterdays results' do
+    t1 = Time.now.utc.at_midnight
+    freeze_time(t1)
+    inc('Googlebot')
+    freeze_time(t1.tomorrow)
+    inc('Googlebot')
+
+    WebCrawlerRequest.write_cache!
+    expect(WebCrawlerRequest.count).to eq(2)
+  end
+
+  it 'clears cache correctly' do
+    inc('Googlebot')
+    inc('Twitterbot')
+    WebCrawlerRequest.clear_cache!
+    WebCrawlerRequest.write_cache!
+
+    expect(WebCrawlerRequest.count).to eq(0)
+  end
+
+  it 'logs a few counts once flushed' do
+    time = Time.now.at_midnight
+    freeze_time(time)
+
+    3.times { inc('Googlebot') }
+    2.times { inc('Twitterbot') }
+    4.times { inc('Bingbot') }
+
+    WebCrawlerRequest.write_cache!
+
+    expect(web_crawler_request('Googlebot').count).to eq(3)
+    expect(web_crawler_request('Twitterbot').count).to eq(2)
+    expect(web_crawler_request('Bingbot').count).to eq(4)
+  end
+end