DEV: Extract out html cleanup so it can be used on other types of cooked content (#31385)

`PrettyText.cook` does two things: 1️⃣ converts raw to cooked HTML, and
2️⃣ partially sanitises the result.

This commit extracts 2️⃣ into its own method, `PrettyText.cleanup`, so that
it can be applied to other cooked content.
This commit is contained in:
Natalie Tay
2025-02-25 10:36:36 +08:00
committed by GitHub
parent d4e5d63d7e
commit ae5ad250f6
2 changed files with 43 additions and 16 deletions

View File

@ -295,25 +295,13 @@ module PrettyText
JS
end
def self.cook(text, opts = {})
def self.cook(raw, opts = {})
options = opts.dup
working_text = text.dup
working_text = raw.dup
sanitized = markdown(working_text, options)
html = markdown(working_text, options)
doc = Nokogiri::HTML5.fragment(sanitized)
add_nofollow = !options[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content
add_rel_attributes_to_user_content(doc, add_nofollow)
strip_hidden_unicode_bidirectional_characters(doc)
sanitize_hotlinked_media(doc)
add_video_placeholder_image(doc)
add_mentions(doc, user_id: opts[:user_id]) if SiteSetting.enable_mentions
scrubber = Loofah::Scrubber.new { |node| node.remove if node.name == "script" }
loofah_fragment = Loofah.html5_fragment(doc.to_html)
loofah_fragment.scrub!(scrubber).to_html
cleanup(html, opts)
end
def self.strip_hidden_unicode_bidirectional_characters(doc)
@ -692,6 +680,22 @@ module PrettyText
rval
end
# Post-processes HTML that has already been cooked, so the same passes can be
# run on cooked content that did not come out of PrettyText.cook.
#
# @param html [String] cooked HTML; parsed as an HTML5 fragment
# @param opts [Hash] reads :omit_nofollow and :user_id
# @return [String] the serialized fragment after all passes and <script> removal
def self.cleanup(html, opts = {})
doc = Nokogiri::HTML5.fragment(html)
# nofollow needs both: the caller did not pass :omit_nofollow, and the site setting is on.
add_nofollow = !opts[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content
# The helpers below are all handed the same parsed fragment; their return
# values are not used, so they presumably modify doc in place.
add_rel_attributes_to_user_content(doc, add_nofollow)
strip_hidden_unicode_bidirectional_characters(doc)
sanitize_hotlinked_media(doc)
add_video_placeholder_image(doc)
add_mentions(doc, user_id: opts[:user_id]) if SiteSetting.enable_mentions
# Serialize, re-parse with Loofah, and remove every node named "script".
scrubber = Loofah::Scrubber.new { |node| node.remove if node.name == "script" }
loofah_fragment = Loofah.html5_fragment(doc.to_html)
loofah_fragment.scrub!(scrubber).to_html
end
private
USER_TYPE = "user"

View File

@ -731,6 +731,26 @@ RSpec.describe PrettyText do
expect(cooked).to eq(html.strip)
end
# Calls PrettyText.cleanup directly on already-cooked HTML (no cook step):
# a U+202E character inside a code block must come back wrapped in a
# bidi-warning span whose title is the translated hidden-bidi message.
it "strips out unicode bidirectional (bidi) override characters and replaces with a highlighted span" do
cooked = <<~HTML
<p>X</p>
<pre><code class="lang-auto">var isAdmin = false;
\u202E
</code></pre>
HTML
cleaned = PrettyText.cleanup(cooked)
hidden_bidi_title = I18n.t("post.hidden_bidi_character")
html = <<~HTML
<p>X</p>
<pre><code class="lang-auto">var isAdmin = false;
<span class="bidi-warning" title="#{hidden_bidi_title}">&lt;U+202E&gt;</span>
</code></pre>
HTML
expect(cleaned.strip).to eq(html.strip)
end
it "fuzzes all possible dangerous unicode bidirectional (bidi) override characters, making sure they are replaced" do
bad_bidi = [
"\u202A",
@ -2389,6 +2409,9 @@ HTML
it "should strip SCRIPT" do
expect(PrettyText.cook("<script>alert(42)</script>")).to eq ""
expect(PrettyText.cook("<div><script>alert(42)</script></div>")).to eq "<div></div>"
# cleanup is also called directly, so script removal is checked without going through cook.
expect(PrettyText.cleanup("<script>alert(42)</script>")).to eq ""
expect(PrettyText.cleanup("<div><script>alert(42)</script></div>")).to eq "<div></div>"
end
it "strips script regardless of sanitize" do