mirror of
https://github.com/discourse/discourse.git
synced 2025-05-23 10:43:40 +08:00
FIX: Google Groups crawler failed to log in
Automating the login to a Google account is quite hard, so this change makes the crawler use the contents of a cookies.txt file instead. It also removes a couple of deprecation warnings and adds some color to the output.
This commit is contained in:
@ -6,22 +6,22 @@ require "bundler/inline"
|
|||||||
gemfile(true) do
|
gemfile(true) do
|
||||||
source "https://rubygems.org"
|
source "https://rubygems.org"
|
||||||
|
|
||||||
gem "nokogiri"
|
|
||||||
gem "webdrivers"
|
gem "webdrivers"
|
||||||
|
gem "colored2"
|
||||||
end
|
end
|
||||||
|
|
||||||
require "fileutils"
|
require "fileutils"
|
||||||
require "nokogiri"
|
|
||||||
require "optparse"
|
require "optparse"
|
||||||
require "webdrivers"
|
|
||||||
require "set"
|
require "set"
|
||||||
require "yaml"
|
require "yaml"
|
||||||
|
|
||||||
DEFAULT_OUTPUT_PATH = "/shared/import/data"
|
DEFAULT_OUTPUT_PATH = "/shared/import/data"
|
||||||
|
DEFAULT_COOKIES_TXT = "/shared/import/cookies.txt"
|
||||||
|
|
||||||
def driver
|
def driver
|
||||||
@driver ||= begin
|
@driver ||= begin
|
||||||
chrome_args = ["headless", "disable-gpu"]
|
chrome_args = ["disable-gpu"]
|
||||||
|
chrome_args << "headless" unless ENV["NOT_HEADLESS"] == '1'
|
||||||
chrome_args << "no-sandbox" if inside_container?
|
chrome_args << "no-sandbox" if inside_container?
|
||||||
options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
|
options = Selenium::WebDriver::Chrome::Options.new(args: chrome_args)
|
||||||
Selenium::WebDriver.for(:chrome, options: options)
|
Selenium::WebDriver.for(:chrome, options: options)
|
||||||
@ -63,7 +63,7 @@ def find(css, parent_element = driver)
|
|||||||
begin
|
begin
|
||||||
retries ||= 0
|
retries ||= 0
|
||||||
parent_element.find_element(css: css)
|
parent_element.find_element(css: css)
|
||||||
rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotVisibleError
|
rescue Net::ReadTimeout, Selenium::WebDriver::Error::ElementNotInteractableError
|
||||||
sleep retries
|
sleep retries
|
||||||
retry if (retries += 1) < MAX_FIND_RETRIES
|
retry if (retries += 1) < MAX_FIND_RETRIES
|
||||||
end
|
end
|
||||||
@ -83,7 +83,7 @@ end
|
|||||||
|
|
||||||
def crawl_topic(url)
|
def crawl_topic(url)
|
||||||
if @scraped_topic_urls.include?(url)
|
if @scraped_topic_urls.include?(url)
|
||||||
puts "Skipping #{url}"
|
puts "Skipping".green << " #{url}"
|
||||||
return
|
return
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -99,7 +99,7 @@ def crawl_topic(url)
|
|||||||
|
|
||||||
@scraped_topic_urls << url
|
@scraped_topic_urls << url
|
||||||
rescue
|
rescue
|
||||||
puts "Failed to scrape topic at #{url}"
|
puts "Failed to scrape topic at #{url}".red
|
||||||
raise
|
raise
|
||||||
end
|
end
|
||||||
|
|
||||||
@ -113,7 +113,7 @@ def crawl_message(url, might_be_deleted)
|
|||||||
@first_message_checked = true
|
@first_message_checked = true
|
||||||
|
|
||||||
if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
|
if content.match?(/From:.*\.\.\.@.*/i) && !@force_import
|
||||||
exit_with_error(<<~MSG)
|
exit_with_error(<<~MSG.red.bold)
|
||||||
It looks like you do not have permissions to see email addresses. Aborting.
|
It looks like you do not have permissions to see email addresses. Aborting.
|
||||||
Use the --force option to import anyway.
|
Use the --force option to import anyway.
|
||||||
MSG
|
MSG
|
||||||
@ -125,67 +125,44 @@ rescue Selenium::WebDriver::Error::NoSuchElementError
|
|||||||
raise unless might_be_deleted
|
raise unless might_be_deleted
|
||||||
puts "Message might be deleted. Skipping #{url}"
|
puts "Message might be deleted. Skipping #{url}"
|
||||||
rescue
|
rescue
|
||||||
puts "Failed to scrape message at #{url}"
|
puts "Failed to scrape message at #{url}".red
|
||||||
raise
|
raise
|
||||||
end
|
end
|
||||||
|
|
||||||
def login
|
def login
|
||||||
puts "Logging in..."
|
puts "Logging in..."
|
||||||
get("https://www.google.com/accounts/Login")
|
get("https://google.com/404")
|
||||||
|
|
||||||
sleep(1)
|
add_cookies(
|
||||||
email_element = wait_for_element("input[type='email']")
|
"accounts.google.com",
|
||||||
exit_with_error("Failed to detect 'email' input on login page") if !email_element
|
"myaccount.google.com",
|
||||||
|
"google.com"
|
||||||
|
)
|
||||||
|
|
||||||
driver.action.move_to(email_element)
|
get("https://accounts.google.com/servicelogin")
|
||||||
email_element.send_keys(@email)
|
|
||||||
email_element.send_keys("\n")
|
|
||||||
|
|
||||||
sleep(1)
|
begin
|
||||||
password_element = wait_for_element("input[type='password']")
|
wait_for_url { |url| url.start_with?("https://myaccount.google.com") }
|
||||||
exit_with_error("Failed to detect 'password' input on login page") if !password_element
|
rescue Selenium::WebDriver::Error::TimeoutError
|
||||||
|
exit_with_error("Failed to login. Please check the content of your cookies.txt".red.bold)
|
||||||
driver.action.move_to(password_element)
|
|
||||||
password_element.send_keys(@password)
|
|
||||||
password_element.send_keys("\n")
|
|
||||||
|
|
||||||
sleep(1)
|
|
||||||
|
|
||||||
if driver.current_url.include?("challenge")
|
|
||||||
puts "", "2-Step Verification is required."
|
|
||||||
puts "Unlock on your phone and press Enter"
|
|
||||||
puts "or enter the code from your authenticator app"
|
|
||||||
puts "or enter the code you received via SMS (without the G- prefix)"
|
|
||||||
|
|
||||||
print "Enter code: "
|
|
||||||
|
|
||||||
code = gets.chomp
|
|
||||||
|
|
||||||
if code.empty?
|
|
||||||
# Verification via phone?
|
|
||||||
begin
|
|
||||||
wait_for_url { |url| !url.include?("challenge") }
|
|
||||||
rescue Selenium::WebDriver::Error::TimeOutError
|
|
||||||
exit_with_error("Failed to login. Did you tap 'Yes' on your phone to allow the login?")
|
|
||||||
end
|
|
||||||
else
|
|
||||||
code_element = wait_for_element("input[type='tel']")
|
|
||||||
exit_with_error("Failed to detect 'code' input on login page") if !code_element
|
|
||||||
|
|
||||||
code_element.send_keys(code)
|
|
||||||
code_element.send_keys("\n")
|
|
||||||
|
|
||||||
begin
|
|
||||||
wait_for_url { |url| !url.include?("challenge") }
|
|
||||||
rescue Selenium::WebDriver::Error::TimeOutError
|
|
||||||
exit_with_error("Failed to login. Wrong code?")
|
|
||||||
end
|
|
||||||
end
|
|
||||||
end
|
end
|
||||||
|
end
|
||||||
|
|
||||||
sleep(1)
|
def add_cookies(*domains)
|
||||||
user_element = wait_for_element("a[aria-label*='#{@email}']")
|
File.readlines(@cookies).each do |line|
|
||||||
exit_with_error("Failed to login") if !user_element
|
parts = line.chomp.split("\t")
|
||||||
|
next if parts.size != 7 || !domains.any? { |domain| parts[0] =~ /^\.?#{Regexp.escape(domain)}$/ }
|
||||||
|
|
||||||
|
driver.manage.add_cookie(
|
||||||
|
domain: parts[0],
|
||||||
|
httpOnly: "true".casecmp?(parts[1]),
|
||||||
|
path: parts[2],
|
||||||
|
secure: "true".casecmp?(parts[3]),
|
||||||
|
expires: parts[4] == "0" ? nil : DateTime.strptime(parts[4], "%s"),
|
||||||
|
name: parts[5],
|
||||||
|
value: parts[6]
|
||||||
|
)
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def wait_for_url
|
def wait_for_url
|
||||||
@ -193,14 +170,6 @@ def wait_for_url
|
|||||||
wait.until { yield(driver.current_url) }
|
wait.until { yield(driver.current_url) }
|
||||||
end
|
end
|
||||||
|
|
||||||
def wait_for_element(css)
|
|
||||||
wait = Selenium::WebDriver::Wait.new(timeout: 5)
|
|
||||||
wait.until { driver.find_element(css: css).displayed? }
|
|
||||||
find(css)
|
|
||||||
rescue Selenium::WebDriver::Error::TimeOutError
|
|
||||||
nil
|
|
||||||
end
|
|
||||||
|
|
||||||
def exit_with_error(*messages)
|
def exit_with_error(*messages)
|
||||||
STDERR.puts messages
|
STDERR.puts messages
|
||||||
exit 1
|
exit 1
|
||||||
@ -231,9 +200,8 @@ def parse_arguments
|
|||||||
parser = OptionParser.new do |opts|
|
parser = OptionParser.new do |opts|
|
||||||
opts.banner = "Usage: google_groups.rb [options]"
|
opts.banner = "Usage: google_groups.rb [options]"
|
||||||
|
|
||||||
opts.on("-e", "--email EMAIL", "email address of group admin or manager") { |v| @email = v }
|
|
||||||
opts.on("-p", "--password PASSWORD", "password of group admin or manager") { |v| @password = v }
|
|
||||||
opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
|
opts.on("-g", "--groupname GROUPNAME") { |v| @groupname = v }
|
||||||
|
opts.on("-c", "--cookies PATH", "path to cookies.txt") { |v| @cookies = v }
|
||||||
opts.on("--path PATH", "output path for emails") { |v| @path = v }
|
opts.on("--path PATH", "output path for emails") { |v| @path = v }
|
||||||
opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
|
opts.on("-f", "--force", "force import when user isn't allowed to see email addresses") { @force_import = true }
|
||||||
opts.on("-h", "--help") do
|
opts.on("-h", "--help") do
|
||||||
@ -248,14 +216,15 @@ def parse_arguments
|
|||||||
exit_with_error(e.message, "", parser)
|
exit_with_error(e.message, "", parser)
|
||||||
end
|
end
|
||||||
|
|
||||||
mandatory = [:email, :password, :groupname]
|
@cookies = DEFAULT_COOKIES_TXT if @cookies.nil? && File.exist?(DEFAULT_COOKIES_TXT)
|
||||||
|
@path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?
|
||||||
|
|
||||||
|
mandatory = [:groupname, :cookies]
|
||||||
missing = mandatory.select { |name| instance_variable_get("@#{name}").nil? }
|
missing = mandatory.select { |name| instance_variable_get("@#{name}").nil? }
|
||||||
|
|
||||||
if missing.any?
|
exit_with_error("Missing arguments: #{missing.join(', ')}".red.bold, "", parser, "") if missing.any?
|
||||||
exit_with_error("Missing arguments: #{missing.join(', ')}", "", parser)
|
exit_with_error("cookies.txt not found at #{@cookies}".red.bold, "") if !File.exist?(@cookies)
|
||||||
end
|
|
||||||
|
|
||||||
@path = File.join(DEFAULT_OUTPUT_PATH, @groupname) if @path.nil?
|
|
||||||
FileUtils.mkpath(@path)
|
FileUtils.mkpath(@path)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user