FIX: Improve HTML to Markdown conversion (#9231)

This commit ensures that whitespace is preserved inside <pre> elements, but
removed inside text paragraphs.
This commit is contained in:
Dan Ungureanu
2020-03-18 19:31:10 +02:00
committed by GitHub
parent 778454e26b
commit 1393950dbc
2 changed files with 16 additions and 3 deletions

View File

@@ -31,7 +31,7 @@ class HtmlToMarkdown
def remove_whitespaces!
@doc.traverse do |node|
if node.is_a? Nokogiri::XML::Text
if node.is_a?(Nokogiri::XML::Text) && node.parent.name != "pre"
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element&.description&.block?
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && node.parent.description&.block?
node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element&.description&.block?
@@ -220,10 +220,21 @@ class HtmlToMarkdown
end
def visit_text(node)
top_block = @stack[-1]
if top_block.name == "pre"
top_block.markdown << node.text
return
end
node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
indent = node.text[/^\s+/] || ""
if top_block.markdown.present? && indent = node.text[/^\s+/]
top_block.markdown << indent
end
text = node.text.gsub(/^\s+/, "").gsub(/\s{2,}/, " ")
@stack[-1].markdown << [indent, text].join("")
top_block.markdown << text
end
def format_block