From 1393950dbc44e7ee0390560079852c38cb0e9c2a Mon Sep 17 00:00:00 2001
From: Dan Ungureanu
Date: Wed, 18 Mar 2020 19:31:10 +0200
Subject: [PATCH] FIX: Improve HTML to Markdown conversion (#9231)

This commit ensures that whitespaces are preserved in <pre>, but removed
inside text paragraphs.
---
 lib/html_to_markdown.rb                  | 17 ++++++++++++++---
 spec/components/html_to_markdown_spec.rb |  2 ++
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb
index a17a78eb2b0..a3eb5e8aaeb 100644
--- a/lib/html_to_markdown.rb
+++ b/lib/html_to_markdown.rb
@@ -31,7 +31,7 @@ class HtmlToMarkdown
 
   def remove_whitespaces!
     @doc.traverse do |node|
-      if node.is_a? Nokogiri::XML::Text
+      if node.is_a?(Nokogiri::XML::Text) && node.parent.name != "pre"
         node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element&.description&.block?
         node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && node.parent.description&.block?
         node.content = node.content.gsub(/[[:space:]]+\z/, "") if node.next_element&.description&.block?
@@ -220,10 +220,21 @@ class HtmlToMarkdown
   end
 
   def visit_text(node)
+    top_block = @stack[-1]
+
+    if top_block.name == "pre"
+      top_block.markdown << node.text
+      return
+    end
+
     node.content = node.content.gsub(/\A[[:space:]]+/, "") if node.previous_element.nil? && EMPHASIS.include?(node.parent.name)
-    indent = node.text[/^\s+/] || ""
+
+    if top_block.markdown.present? && indent = node.text[/^\s+/]
+      top_block.markdown << indent
+    end
+
     text = node.text.gsub(/^\s+/, "").gsub(/\s{2,}/, " ")
-    @stack[-1].markdown << [indent, text].join("")
+    top_block.markdown << text
   end
 
   def format_block
diff --git a/spec/components/html_to_markdown_spec.rb b/spec/components/html_to_markdown_spec.rb
index aa89ac351c7..266e875aca6 100644
--- a/spec/components/html_to_markdown_spec.rb
+++ b/spec/components/html_to_markdown_spec.rb
@@ -204,6 +204,7 @@ describe HtmlToMarkdown do
     expect(html_to_markdown("<pre>var foo = 'bar';</pre>")).to eq("```\nvar foo = 'bar';\n```")
     expect(html_to_markdown("<pre><code>var foo = 'bar';</code></pre>")).to eq("```\nvar foo = 'bar';\n```")
     expect(html_to_markdown(%Q{<pre><code class="lang-javascript">var foo = 'bar';</code></pre>})).to eq("```javascript\nvar foo = 'bar';\n```")
+    expect(html_to_markdown("<pre>    function f() {\n        console.log('Hello world!');\n    }</pre>")).to eq("```\n    function f() {\n        console.log('Hello world!');\n    }\n```")
   end
 
   it "supports <pre> inside <blockquote>" do
@@ -220,6 +221,7 @@ describe HtmlToMarkdown do
 
   it "handles <p>" do
     expect(html_to_markdown("<p>1st paragraph</p><p>2nd paragraph</p>")).to eq("1st paragraph\n\n2nd paragraph")
+    expect(html_to_markdown("<p>1st paragraph</p>\n<p>2nd paragraph\n 2nd paragraph</p>\n<p>3rd paragraph</p>")).to eq("1st paragraph\n\n2nd paragraph\n2nd paragraph\n\n3rd paragraph")
   end
 
   it "handles <div>" do