Merge pull request #2703 from riking/email-pr-receiver

Heavy refactor for Email::Receiver
This commit is contained in:
Sam
2014-08-29 09:59:59 +10:00
11 changed files with 457 additions and 249 deletions

120
lib/email/html_cleaner.rb Normal file
View File

@ -0,0 +1,120 @@
module Email
# HtmlCleaner cleans up the extremely dirty HTML that many email clients
# generate by stripping out any excess divs or spans, removing styling in
# the process (which also makes the html more suitable to be parsed as
# Markdown).
class HtmlCleaner
# Elements to hoist all children out of
HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
# Node types to always delete
HTML_DELETE_ELEMENT_TYPES = [Nokogiri::XML::Node::DTD_NODE,
Nokogiri::XML::Node::COMMENT_NODE,
]
# Private variables:
# @doc - nokogiri document
# @out - same as @doc, but only if trimming has occured
def initialize(html)
if String === html
@doc = Nokogiri::HTML(html)
else
@doc = html
end
end
class << self
# Email::HtmlCleaner.trim(inp, opts={})
#
# Arguments:
# inp - Either a HTML string or a Nokogiri document.
# Options:
# :return => :doc, :string
# Specify the desired return type.
# Defaults to the type of the input.
# A value of :string is equivalent to calling get_document_text()
# on the returned document.
def trim(inp, opts={})
cleaner = HtmlCleaner.new(inp)
opts[:return] ||= ((String === inp) ? :string : :doc)
if opts[:return] == :string
cleaner.output_html
else
cleaner.output_document
end
end
# Email::HtmlCleaner.get_document_text(doc)
#
# Get the body portion of the document, including html, as a string.
def get_document_text(doc)
body = doc.xpath('//body')
if body
body.inner_html
else
doc.inner_html
end
end
end
def output_document
@out ||= begin
doc = @doc
trim_process_node doc
add_newlines doc
doc
end
end
def output_html
HtmlCleaner.get_document_text(output_document)
end
private
def add_newlines(doc)
doc.xpath('//br').each do |br|
br.replace(Nokogiri::XML::Text.new("\n", doc))
end
end
def trim_process_node(node)
if should_hoist?(node)
hoisted = trim_hoist_element node
hoisted.each { |child| trim_process_node child }
elsif should_delete?(node)
node.remove
else
if children = node.children
children.each { |child| trim_process_node child }
end
end
node
end
def trim_hoist_element(element)
hoisted = []
element.children.each do |child|
element.before(child)
hoisted << child
end
element.remove
hoisted
end
def should_hoist?(node)
return false unless node.element?
HTML_HOIST_ELEMENTS.include? node.name
end
def should_delete?(node)
return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
return true if node.element? && node.name == 'head'
return true if node.text? && node.text.strip.blank?
false
end
end
end

View File

@ -1,3 +1,4 @@
require 'email/html_cleaner'
#
# Handles an incoming message
#
@ -26,20 +27,12 @@ module Email
def process
raise EmptyEmailError if @raw.blank?
@message = Mail.new(@raw)
message = Mail.new(@raw)
# First remove the known discourse stuff.
parse_body
raise EmptyEmailError if @body.blank?
# Then run the github EmailReplyParser on it in case we didn't catch it
@body = EmailReplyParser.read(@body).visible_text.force_encoding('UTF-8')
discourse_email_parser
raise EmailUnparsableError if @body.blank?
body = parse_body message
dest_info = {type: :invalid, obj: nil}
@message.to.each do |to_address|
message.to.each do |to_address|
if dest_info[:type] == :invalid
dest_info = check_address to_address
end
@ -47,6 +40,10 @@ module Email
raise BadDestinationAddress if dest_info[:type] == :invalid
# TODO get to a state where we can remove this
@message = message
@body = body
if dest_info[:type] == :category
raise BadDestinationAddress unless SiteSetting.email_in
category = dest_info[:obj]
@ -74,6 +71,8 @@ module Email
create_reply
end
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e
raise EmailUnparsableError.new(e)
end
def check_address(address)
@ -94,57 +93,64 @@ module Email
{type: :invalid, obj: nil}
end
private
def parse_body(message)
body = select_body message
encoding = body.encoding
raise EmptyEmailError if body.strip.blank?
def parse_body
body = discourse_email_trimmer body
raise EmptyEmailError if body.strip.blank?
body = EmailReplyParser.parse_reply body
raise EmptyEmailError if body.strip.blank?
body.force_encoding(encoding).encode("UTF-8")
end
def select_body(message)
html = nil
# If the message is multipart, find the best type for our purposes
if @message.multipart?
if p = @message.text_part
@body = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s
return @body
elsif p = @message.html_part
html = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s
# If the message is multipart, return that part (favor html)
if message.multipart?
html = fix_charset message.html_part
text = fix_charset message.text_part
# TODO picking text if available may be better
if text && !html
return text
end
elsif message.content_type =~ /text\/html/
html = fix_charset message
end
if @message.content_type =~ /text\/html/
if defined? @message.charset
html = @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s
else
html = @message.body.to_s
end
if html
body = HtmlCleaner.new(html).output_html
else
body = fix_charset message
end
if html.present?
@body = scrub_html(html)
return @body
end
@body = @message.charset ? @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s.strip : @message.body.to_s
# Certain trigger phrases that means we didn't parse correctly
@body = nil if @body =~ /Content\-Type\:/ ||
@body =~ /multipart\/alternative/ ||
@body =~ /text\/plain/
if body =~ /Content\-Type\:/ || body =~ /multipart\/alternative/ || body =~ /text\/plain/
raise EmptyEmailError
end
@body
body
end
def scrub_html(html)
# If we have an HTML message, strip the markup
doc = Nokogiri::HTML(html)
# Force encoding to UTF-8 on a Mail::Message or Mail::Part
def fix_charset(object)
return nil if object.nil?
# Blackberry is annoying in that it only provides HTML. We can easily extract it though
content = doc.at("#BB10_response_div")
return content.text if content.present?
doc.xpath("//text()").text
if object.charset
object.body.decoded.force_encoding(object.charset).encode("UTF-8").to_s
else
object.body.to_s
end
end
def discourse_email_parser
lines = @body.scrub.lines.to_a
REPLYING_HEADER_LABELS = ['From', 'Sent', 'To', 'Subject', 'Reply To']
REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |lbl| "#{lbl}:" })
def discourse_email_trimmer(body)
lines = body.scrub.lines.to_a
range_end = 0
lines.each_with_index do |l, idx|
@ -155,11 +161,15 @@ module Email
# Let's try it and see how well it works.
(l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/)
# Headers on subsequent lines
break if (0..2).all? { |off| lines[idx+off] =~ REPLYING_HEADER_REGEX }
# Headers on the same line
break if REPLYING_HEADER_LABELS.count { |lbl| l.include? lbl } >= 3
range_end = idx
end
@body = lines[0..range_end].join
@body.strip!
lines[0..range_end].join.strip
end
def wrap_body_in_quote(user_email)
@ -168,37 +178,37 @@ module Email
[/quote]"
end
private
def create_reply
create_post_with_attachments(email_log.user, @body, @email_log.topic_id, @email_log.post.post_number)
create_post_with_attachments(@email_log.user,
raw: @body,
topic_id: @email_log.topic_id,
reply_to_post_number: @email_log.post.post_number)
end
def create_new_topic
topic = TopicCreator.new(
@user,
Guardian.new(@user),
category: @category_id,
title: @message.subject,
).create
post = create_post_with_attachments(@user, @body, topic.id)
post = create_post_with_attachments(@user,
raw: @body,
title: @message.subject,
category: @category_id)
EmailLog.create(
email_type: "topic_via_incoming_email",
to_address: @message.to.first,
topic_id: topic.id,
to_address: @message.from.first, # pick from address because we want the user's email
topic_id: post.topic.id,
user_id: @user.id,
)
post
end
def create_post_with_attachments(user, raw, topic_id, reply_to_post_number=nil)
def create_post_with_attachments(user, post_opts={})
options = {
raw: raw,
topic_id: topic_id,
cooking_options: { traditional_markdown_linebreaks: true },
}
options[:reply_to_post_number] = reply_to_post_number if reply_to_post_number
}.merge(post_opts)
raw = options[:raw]
# deal with attachments
@message.attachments.each do |attachment|
@ -215,9 +225,10 @@ module Email
ensure
tmp.close!
end
end
options[:raw] = raw
create_post(user, options)
end
@ -232,9 +243,11 @@ module Email
def create_post(user, options)
creator = PostCreator.new(user, options)
post = creator.create
if creator.errors.present?
raise InvalidPost, creator.errors.full_messages.join("\n")
end
post
end