mirror of
https://github.com/discourse/discourse.git
synced 2025-05-23 08:51:05 +08:00
Merge pull request #2703 from riking/email-pr-receiver
Heavy refactor for Email::Receiver
This commit is contained in:
120
lib/email/html_cleaner.rb
Normal file
120
lib/email/html_cleaner.rb
Normal file
@ -0,0 +1,120 @@
|
||||
module Email
|
||||
# HtmlCleaner cleans up the extremely dirty HTML that many email clients
|
||||
# generate by stripping out any excess divs or spans, removing styling in
|
||||
# the process (which also makes the html more suitable to be parsed as
|
||||
# Markdown).
|
||||
class HtmlCleaner
|
||||
# Elements to hoist all children out of
|
||||
HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
|
||||
# Node types to always delete
|
||||
HTML_DELETE_ELEMENT_TYPES = [Nokogiri::XML::Node::DTD_NODE,
|
||||
Nokogiri::XML::Node::COMMENT_NODE,
|
||||
]
|
||||
|
||||
# Private variables:
|
||||
# @doc - nokogiri document
|
||||
# @out - same as @doc, but only if trimming has occured
|
||||
def initialize(html)
|
||||
if String === html
|
||||
@doc = Nokogiri::HTML(html)
|
||||
else
|
||||
@doc = html
|
||||
end
|
||||
end
|
||||
|
||||
class << self
|
||||
# Email::HtmlCleaner.trim(inp, opts={})
|
||||
#
|
||||
# Arguments:
|
||||
# inp - Either a HTML string or a Nokogiri document.
|
||||
# Options:
|
||||
# :return => :doc, :string
|
||||
# Specify the desired return type.
|
||||
# Defaults to the type of the input.
|
||||
# A value of :string is equivalent to calling get_document_text()
|
||||
# on the returned document.
|
||||
def trim(inp, opts={})
|
||||
cleaner = HtmlCleaner.new(inp)
|
||||
|
||||
opts[:return] ||= ((String === inp) ? :string : :doc)
|
||||
|
||||
if opts[:return] == :string
|
||||
cleaner.output_html
|
||||
else
|
||||
cleaner.output_document
|
||||
end
|
||||
end
|
||||
|
||||
# Email::HtmlCleaner.get_document_text(doc)
|
||||
#
|
||||
# Get the body portion of the document, including html, as a string.
|
||||
def get_document_text(doc)
|
||||
body = doc.xpath('//body')
|
||||
if body
|
||||
body.inner_html
|
||||
else
|
||||
doc.inner_html
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def output_document
|
||||
@out ||= begin
|
||||
doc = @doc
|
||||
trim_process_node doc
|
||||
add_newlines doc
|
||||
doc
|
||||
end
|
||||
end
|
||||
|
||||
def output_html
|
||||
HtmlCleaner.get_document_text(output_document)
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def add_newlines(doc)
|
||||
doc.xpath('//br').each do |br|
|
||||
br.replace(Nokogiri::XML::Text.new("\n", doc))
|
||||
end
|
||||
end
|
||||
|
||||
def trim_process_node(node)
|
||||
if should_hoist?(node)
|
||||
hoisted = trim_hoist_element node
|
||||
hoisted.each { |child| trim_process_node child }
|
||||
elsif should_delete?(node)
|
||||
node.remove
|
||||
else
|
||||
if children = node.children
|
||||
children.each { |child| trim_process_node child }
|
||||
end
|
||||
end
|
||||
|
||||
node
|
||||
end
|
||||
|
||||
def trim_hoist_element(element)
|
||||
hoisted = []
|
||||
element.children.each do |child|
|
||||
element.before(child)
|
||||
hoisted << child
|
||||
end
|
||||
element.remove
|
||||
hoisted
|
||||
end
|
||||
|
||||
def should_hoist?(node)
|
||||
return false unless node.element?
|
||||
HTML_HOIST_ELEMENTS.include? node.name
|
||||
end
|
||||
|
||||
def should_delete?(node)
|
||||
return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
|
||||
return true if node.element? && node.name == 'head'
|
||||
return true if node.text? && node.text.strip.blank?
|
||||
|
||||
false
|
||||
end
|
||||
end
|
||||
end
|
@ -1,3 +1,4 @@
|
||||
require 'email/html_cleaner'
|
||||
#
|
||||
# Handles an incoming message
|
||||
#
|
||||
@ -26,20 +27,12 @@ module Email
|
||||
def process
|
||||
raise EmptyEmailError if @raw.blank?
|
||||
|
||||
@message = Mail.new(@raw)
|
||||
message = Mail.new(@raw)
|
||||
|
||||
# First remove the known discourse stuff.
|
||||
parse_body
|
||||
raise EmptyEmailError if @body.blank?
|
||||
|
||||
# Then run the github EmailReplyParser on it in case we didn't catch it
|
||||
@body = EmailReplyParser.read(@body).visible_text.force_encoding('UTF-8')
|
||||
|
||||
discourse_email_parser
|
||||
raise EmailUnparsableError if @body.blank?
|
||||
body = parse_body message
|
||||
|
||||
dest_info = {type: :invalid, obj: nil}
|
||||
@message.to.each do |to_address|
|
||||
message.to.each do |to_address|
|
||||
if dest_info[:type] == :invalid
|
||||
dest_info = check_address to_address
|
||||
end
|
||||
@ -47,6 +40,10 @@ module Email
|
||||
|
||||
raise BadDestinationAddress if dest_info[:type] == :invalid
|
||||
|
||||
# TODO get to a state where we can remove this
|
||||
@message = message
|
||||
@body = body
|
||||
|
||||
if dest_info[:type] == :category
|
||||
raise BadDestinationAddress unless SiteSetting.email_in
|
||||
category = dest_info[:obj]
|
||||
@ -74,6 +71,8 @@ module Email
|
||||
|
||||
create_reply
|
||||
end
|
||||
rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError => e
|
||||
raise EmailUnparsableError.new(e)
|
||||
end
|
||||
|
||||
def check_address(address)
|
||||
@ -94,57 +93,64 @@ module Email
|
||||
{type: :invalid, obj: nil}
|
||||
end
|
||||
|
||||
private
|
||||
def parse_body(message)
|
||||
body = select_body message
|
||||
encoding = body.encoding
|
||||
raise EmptyEmailError if body.strip.blank?
|
||||
|
||||
def parse_body
|
||||
body = discourse_email_trimmer body
|
||||
raise EmptyEmailError if body.strip.blank?
|
||||
|
||||
body = EmailReplyParser.parse_reply body
|
||||
raise EmptyEmailError if body.strip.blank?
|
||||
|
||||
body.force_encoding(encoding).encode("UTF-8")
|
||||
end
|
||||
|
||||
def select_body(message)
|
||||
html = nil
|
||||
|
||||
# If the message is multipart, find the best type for our purposes
|
||||
if @message.multipart?
|
||||
if p = @message.text_part
|
||||
@body = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s
|
||||
return @body
|
||||
elsif p = @message.html_part
|
||||
html = p.charset ? p.body.decoded.force_encoding(p.charset).encode("UTF-8").to_s : p.body.to_s
|
||||
# If the message is multipart, return that part (favor html)
|
||||
if message.multipart?
|
||||
html = fix_charset message.html_part
|
||||
text = fix_charset message.text_part
|
||||
# TODO picking text if available may be better
|
||||
if text && !html
|
||||
return text
|
||||
end
|
||||
elsif message.content_type =~ /text\/html/
|
||||
html = fix_charset message
|
||||
end
|
||||
|
||||
if @message.content_type =~ /text\/html/
|
||||
if defined? @message.charset
|
||||
html = @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s
|
||||
else
|
||||
html = @message.body.to_s
|
||||
end
|
||||
if html
|
||||
body = HtmlCleaner.new(html).output_html
|
||||
else
|
||||
body = fix_charset message
|
||||
end
|
||||
|
||||
if html.present?
|
||||
@body = scrub_html(html)
|
||||
return @body
|
||||
end
|
||||
|
||||
@body = @message.charset ? @message.body.decoded.force_encoding(@message.charset).encode("UTF-8").to_s.strip : @message.body.to_s
|
||||
|
||||
# Certain trigger phrases that means we didn't parse correctly
|
||||
@body = nil if @body =~ /Content\-Type\:/ ||
|
||||
@body =~ /multipart\/alternative/ ||
|
||||
@body =~ /text\/plain/
|
||||
if body =~ /Content\-Type\:/ || body =~ /multipart\/alternative/ || body =~ /text\/plain/
|
||||
raise EmptyEmailError
|
||||
end
|
||||
|
||||
@body
|
||||
body
|
||||
end
|
||||
|
||||
def scrub_html(html)
|
||||
# If we have an HTML message, strip the markup
|
||||
doc = Nokogiri::HTML(html)
|
||||
# Force encoding to UTF-8 on a Mail::Message or Mail::Part
|
||||
def fix_charset(object)
|
||||
return nil if object.nil?
|
||||
|
||||
# Blackberry is annoying in that it only provides HTML. We can easily extract it though
|
||||
content = doc.at("#BB10_response_div")
|
||||
return content.text if content.present?
|
||||
|
||||
doc.xpath("//text()").text
|
||||
if object.charset
|
||||
object.body.decoded.force_encoding(object.charset).encode("UTF-8").to_s
|
||||
else
|
||||
object.body.to_s
|
||||
end
|
||||
end
|
||||
|
||||
def discourse_email_parser
|
||||
lines = @body.scrub.lines.to_a
|
||||
REPLYING_HEADER_LABELS = ['From', 'Sent', 'To', 'Subject', 'Reply To']
|
||||
REPLYING_HEADER_REGEX = Regexp.union(REPLYING_HEADER_LABELS.map { |lbl| "#{lbl}:" })
|
||||
|
||||
def discourse_email_trimmer(body)
|
||||
lines = body.scrub.lines.to_a
|
||||
range_end = 0
|
||||
|
||||
lines.each_with_index do |l, idx|
|
||||
@ -155,11 +161,15 @@ module Email
|
||||
# Let's try it and see how well it works.
|
||||
(l =~ /\d{4}/ && l =~ /\d:\d\d/ && l =~ /\:$/)
|
||||
|
||||
# Headers on subsequent lines
|
||||
break if (0..2).all? { |off| lines[idx+off] =~ REPLYING_HEADER_REGEX }
|
||||
# Headers on the same line
|
||||
break if REPLYING_HEADER_LABELS.count { |lbl| l.include? lbl } >= 3
|
||||
|
||||
range_end = idx
|
||||
end
|
||||
|
||||
@body = lines[0..range_end].join
|
||||
@body.strip!
|
||||
lines[0..range_end].join.strip
|
||||
end
|
||||
|
||||
def wrap_body_in_quote(user_email)
|
||||
@ -168,37 +178,37 @@ module Email
|
||||
[/quote]"
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def create_reply
|
||||
create_post_with_attachments(email_log.user, @body, @email_log.topic_id, @email_log.post.post_number)
|
||||
create_post_with_attachments(@email_log.user,
|
||||
raw: @body,
|
||||
topic_id: @email_log.topic_id,
|
||||
reply_to_post_number: @email_log.post.post_number)
|
||||
end
|
||||
|
||||
def create_new_topic
|
||||
topic = TopicCreator.new(
|
||||
@user,
|
||||
Guardian.new(@user),
|
||||
category: @category_id,
|
||||
title: @message.subject,
|
||||
).create
|
||||
|
||||
post = create_post_with_attachments(@user, @body, topic.id)
|
||||
post = create_post_with_attachments(@user,
|
||||
raw: @body,
|
||||
title: @message.subject,
|
||||
category: @category_id)
|
||||
|
||||
EmailLog.create(
|
||||
email_type: "topic_via_incoming_email",
|
||||
to_address: @message.to.first,
|
||||
topic_id: topic.id,
|
||||
to_address: @message.from.first, # pick from address because we want the user's email
|
||||
topic_id: post.topic.id,
|
||||
user_id: @user.id,
|
||||
)
|
||||
|
||||
post
|
||||
end
|
||||
|
||||
def create_post_with_attachments(user, raw, topic_id, reply_to_post_number=nil)
|
||||
def create_post_with_attachments(user, post_opts={})
|
||||
options = {
|
||||
raw: raw,
|
||||
topic_id: topic_id,
|
||||
cooking_options: { traditional_markdown_linebreaks: true },
|
||||
}
|
||||
options[:reply_to_post_number] = reply_to_post_number if reply_to_post_number
|
||||
}.merge(post_opts)
|
||||
|
||||
raw = options[:raw]
|
||||
|
||||
# deal with attachments
|
||||
@message.attachments.each do |attachment|
|
||||
@ -215,9 +225,10 @@ module Email
|
||||
ensure
|
||||
tmp.close!
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
options[:raw] = raw
|
||||
|
||||
create_post(user, options)
|
||||
end
|
||||
|
||||
@ -232,9 +243,11 @@ module Email
|
||||
def create_post(user, options)
|
||||
creator = PostCreator.new(user, options)
|
||||
post = creator.create
|
||||
|
||||
if creator.errors.present?
|
||||
raise InvalidPost, creator.errors.full_messages.join("\n")
|
||||
end
|
||||
|
||||
post
|
||||
end
|
||||
|
||||
|
Reference in New Issue
Block a user