FEATURE: add support for figure and figcaption tags in embeddings (#21276)

Many blog posts use these tags for illustrations, and the images inside them were previously omitted.

Additionally strip superfluous HTML and BODY tags from embed HTML.

These tags were incorrectly returned from the server.
This commit is contained in:
Sam
2023-04-27 19:57:06 +10:00
committed by GitHub
parent 0b479d0137
commit 2ccc5fc66e
2 changed files with 50 additions and 10 deletions

View File

@ -114,17 +114,27 @@ class TopicEmbed < ActiveRecord::Base
end end
def self.find_remote(url) def self.find_remote(url)
require "ruby-readability"
url = UrlHelper.normalized_encode(url) url = UrlHelper.normalized_encode(url)
original_uri = URI.parse(url) URI.parse(url) # ensure url parses, will raise if not
fd = FinalDestination.new(url, validate_uri: true, max_redirects: 5, follow_canonical: true) fd = FinalDestination.new(url, validate_uri: true, max_redirects: 5, follow_canonical: true)
uri = fd.resolve uri = fd.resolve
return if uri.blank? return if uri.blank?
begin
html = uri.read
rescue OpenURI::HTTPError, Net::OpenTimeout
return
end
parse_html(html, url)
end
def self.parse_html(html, url)
require "ruby-readability"
opts = { opts = {
tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote], tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote figure figcaption],
attributes: %w[href src class], attributes: %w[href src class],
remove_empty_nodes: false, remove_empty_nodes: false,
} }
@ -139,11 +149,6 @@ class TopicEmbed < ActiveRecord::Base
SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present? SiteSetting.allowed_embed_classnames if SiteSetting.allowed_embed_classnames.present?
response = FetchResponse.new response = FetchResponse.new
begin
html = uri.read
rescue OpenURI::HTTPError, Net::OpenTimeout
return
end
raw_doc = Nokogiri.HTML5(html) raw_doc = Nokogiri.HTML5(html)
auth_element = auth_element =
@ -200,7 +205,7 @@ class TopicEmbed < ActiveRecord::Base
end end
end end
response.body = doc.to_html response.body = doc.at("body").children.to_html
response response
end end

View File

@ -23,6 +23,41 @@ RSpec.describe TopicEmbed do
expect(TopicEmbed.count).to eq(0) expect(TopicEmbed.count).to eq(0)
end end
# Checks that TopicEmbed.parse_html keeps <figure> and <figcaption> elements in
# the returned body, and that the relative image src in the fixture is expected
# to come back as an absolute URL on the host passed as the second argument.
it "Allows figure and figcaption HTML tags" do
# NOTE(review): the second <figure> in this fixture is an opening tag, not
# </figure>, so the first figure is never explicitly closed. This looks like a
# typo -- confirm before changing, since the expected output below depends on it.
html = <<~HTML
<html>
<head>
<title>Some title</title>
</head>
<body>
<div class='content'>
<p>some content</p>
<figure>
<img src="/a.png">
<figcaption>Some caption</figcaption>
<figure>
</div>
</body>
</html>
HTML
parsed = TopicEmbed.parse_html(html, "https://blog.discourse.com/somepost.html")
# div inception is inserted by the readability gem
# The expected body carries no <html>/<body> wrapper, and it contains a nested
# <figure> followed by two </figure> closers, mirroring the unclosed tag in the
# fixture above.
expected = <<~HTML
<div><div>
<div>
<p>some content</p>
<figure>
<img src="https://blog.discourse.com/a.png">
<figcaption>Some caption</figcaption>
<figure>
</figure></figure></div>
</div></div>
HTML
# Leading/trailing whitespace is stripped on both sides before comparing.
expect(parsed.body.strip).to eq(expected.strip)
end
context "when creating a post" do context "when creating a post" do
let!(:post) { TopicEmbed.import(user, url, title, contents) } let!(:post) { TopicEmbed.import(user, url, title, contents) }
let(:topic_embed) { TopicEmbed.find_by(post: post) } let(:topic_embed) { TopicEmbed.find_by(post: post) }