diff --git a/app/helpers/user_notifications_helper.rb b/app/helpers/user_notifications_helper.rb index eb0293183b2..ffbf7352c2a 100644 --- a/app/helpers/user_notifications_helper.rb +++ b/app/helpers/user_notifications_helper.rb @@ -13,7 +13,7 @@ module UserNotificationsHelper end def correct_top_margin(html, desired) - fragment = Nokogiri::HTML.fragment(html) + fragment = Nokogiri::HTML5.fragment(html) if para = fragment.css("p:first").first para["style"] = "margin-top: #{desired};" end @@ -32,7 +32,7 @@ module UserNotificationsHelper end def first_paragraphs_from(html) - doc = Nokogiri::HTML(html) + doc = Nokogiri::HTML5(html) result = +"" length = 0 diff --git a/app/jobs/onceoff/grant_emoji.rb b/app/jobs/onceoff/grant_emoji.rb index 5f85b431b82..5abdb34d5d4 100644 --- a/app/jobs/onceoff/grant_emoji.rb +++ b/app/jobs/onceoff/grant_emoji.rb @@ -14,7 +14,7 @@ module Jobs .where("cooked LIKE '%emoji%'") .find_in_batches do |group| group.each do |p| - doc = Nokogiri::HTML::fragment(p.cooked) + doc = Nokogiri::HTML5::fragment(p.cooked) if (doc.css("img.emoji") - doc.css(".quote img")).size > 0 to_award[p.user_id] ||= { post_id: p.id, created_at: p.created_at } end diff --git a/app/jobs/onceoff/grant_onebox.rb b/app/jobs/onceoff/grant_onebox.rb index 59cf443f4e5..66d2cf26706 100644 --- a/app/jobs/onceoff/grant_onebox.rb +++ b/app/jobs/onceoff/grant_onebox.rb @@ -19,7 +19,7 @@ module Jobs begin # Note we can't use `p.cooked` here because oneboxes have been cooked out cooked = PrettyText.cook(p.raw) - doc = Nokogiri::HTML::fragment(cooked) + doc = Nokogiri::HTML5::fragment(cooked) if doc.search('a.onebox').size > 0 to_award[p.user_id] ||= { post_id: p.id, created_at: p.created_at } end diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb index 25703659460..e7644e8ca84 100644 --- a/app/jobs/regular/pull_hotlinked_images.rb +++ b/app/jobs/regular/pull_hotlinked_images.rb @@ -157,7 +157,7 @@ module Jobs end def extract_images_from(html) - doc = Nokogiri::HTML::fragment(html) + doc = Nokogiri::HTML5::fragment(html) doc.css("img[src], a.lightbox[href]") - doc.css("img.avatar") - diff --git a/app/jobs/regular/update_username.rb b/app/jobs/regular/update_username.rb index d43c119060d..7c9fcedb5a2 100644 --- a/app/jobs/regular/update_username.rb +++ b/app/jobs/regular/update_username.rb @@ -154,11 +154,11 @@ module Jobs # and there is no reason to invalidate oneboxes, run the post analyzer etc. # when only the username changes. def update_cooked(cooked) - doc = Nokogiri::HTML.fragment(cooked) + doc = Nokogiri::HTML5.fragment(cooked) doc.css("a.mention").each do |a| a.content = a.content.gsub(@cooked_mention_username_regex, "@#{@new_username}") - a["href"] = a["href"].gsub(@cooked_mention_user_path_regex, "/u/#{@new_username}") if a["href"] + a["href"] = a["href"].gsub(@cooked_mention_user_path_regex, "/u/#{URI.escape(@new_username)}") if a["href"] end doc.css("aside.quote").each do |aside| diff --git a/app/models/category.rb b/app/models/category.rb index bbf1e9b98cb..ddab403c5d2 100644 --- a/app/models/category.rb +++ b/app/models/category.rb @@ -306,7 +306,7 @@ class Category < ActiveRecord::Base @@cache_text ||= LruRedux::ThreadSafeCache.new(1000) @@cache_text.getset(self.description) do - text = Nokogiri::HTML.fragment(self.description).text.strip + text = Nokogiri::HTML5.fragment(self.description).text.strip Rack::Utils.escape_html(text).html_safe end end diff --git a/app/models/post.rb b/app/models/post.rb index 5da5d163200..f2ee10543f3 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -953,7 +953,7 @@ class Post < ActiveRecord::Base /\/uploads\/short-url\/[a-zA-Z0-9]+(\.[a-z0-9]+)?/ ] - fragments ||= Nokogiri::HTML::fragment(self.cooked) + fragments ||= Nokogiri::HTML5::fragment(self.cooked) selectors = fragments.css("a/@href", "img/@src", "source/@src", "track/@src", "video/@poster") links = selectors.map do |media| diff --git a/app/models/post_analyzer.rb b/app/models/post_analyzer.rb index 63fe9724b4d..bae36c39f86 100644 --- a/app/models/post_analyzer.rb +++ b/app/models/post_analyzer.rb @@ -131,7 +131,7 @@ class PostAnalyzer def cooked_stripped @cooked_stripped ||= begin - doc = Nokogiri::HTML.fragment(cook(@raw, topic_id: @topic_id)) + doc = Nokogiri::HTML5.fragment(cook(@raw, topic_id: @topic_id)) doc.css("pre .mention, aside.quote > .title, aside.quote .mention, aside.quote .mention-group, .onebox, .elided").remove doc end diff --git a/app/models/quoted_post.rb b/app/models/quoted_post.rb index 03b981e1fb3..9a6a96e9ebf 100644 --- a/app/models/quoted_post.rb +++ b/app/models/quoted_post.rb @@ -9,7 +9,7 @@ class QuotedPost < ActiveRecord::Base # we are double parsing this fragment, this may be worth optimising later def self.extract_from(post) - doc = Nokogiri::HTML.fragment(post.cooked) + doc = Nokogiri::HTML5.fragment(post.cooked) uniq = {} diff --git a/app/models/theme_field.rb b/app/models/theme_field.rb index 351c7b90f1a..a9f98ce4078 100644 --- a/app/models/theme_field.rb +++ b/app/models/theme_field.rb @@ -78,7 +78,7 @@ class ThemeField < ActiveRecord::Base js_compiler = ThemeJavascriptCompiler.new(theme_id, self.theme.name) - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) doc.css('script[type="text/x-handlebars"]').each do |node| name = node["name"] || node["data-template-name"] || "broken" diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb index 43de234a159..a18e5b85dea 100644 --- a/app/models/topic_embed.rb +++ b/app/models/topic_embed.rb @@ -126,7 +126,7 @@ class TopicEmbed < ActiveRecord::Base return end - raw_doc = Nokogiri::HTML(html) + raw_doc = Nokogiri::HTML5(html) auth_element = raw_doc.at('meta[@name="author"]') if auth_element.present? response.author = User.where(username_lower: auth_element[:content].strip).first @@ -142,7 +142,7 @@ class TopicEmbed < ActiveRecord::Base title.strip! end response.title = title - doc = Nokogiri::HTML(read_doc.content) + doc = Nokogiri::HTML5(read_doc.content) tags = { 'img' => 'src', 'script' => 'src', 'a' => 'href' } doc.search(tags.keys.join(',')).each do |node| @@ -198,7 +198,7 @@ class TopicEmbed < ActiveRecord::Base prefix = "#{uri.scheme}://#{uri.host}" prefix += ":#{uri.port}" if uri.port != 80 && uri.port != 443 - fragment = Nokogiri::HTML.fragment("
#{CGI.escapeHTML(display_src)}
" + i.replace(Nokogiri::HTML5.fragment("
#{CGI.escapeHTML(display_src)}
")) rescue URI::Error # If the URL is weird, remove the iframe i.remove @@ -242,7 +242,11 @@ module Email strip_classes_and_ids replace_relative_urls replace_secure_media_urls - @fragment.to_html + include_body? ? @fragment.at("body").to_html : @fragment.at("body").children.to_html + end + + def include_body? + @html =~ /
/i end def strip_avatars_and_emojis diff --git a/lib/onebox/engine/whitelisted_generic_onebox.rb b/lib/onebox/engine/whitelisted_generic_onebox.rb index a10f22e83b4..7a46a0d1e5d 100644 --- a/lib/onebox/engine/whitelisted_generic_onebox.rb +++ b/lib/onebox/engine/whitelisted_generic_onebox.rb @@ -24,7 +24,7 @@ module Onebox return true if WhitelistedGenericOnebox.html_providers.include?(data[:provider_name]) if data[:html]["iframe"] - fragment = Nokogiri::HTML::fragment(data[:html]) + fragment = Nokogiri::HTML5::fragment(data[:html]) if iframe = fragment.at_css("iframe") src = iframe["src"] return src.present? && SiteSetting.allowed_iframes.split("|").any? { |url| src.start_with?(url) } diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 4e3ab68cf8d..d3625f19d8b 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -78,7 +78,7 @@ module Oneboxer # Parse URLs out of HTML, returning the document when finished. def self.each_onebox_link(string_or_doc, extra_paths: []) doc = string_or_doc - doc = Nokogiri::HTML::fragment(doc) if doc.is_a?(String) + doc = Nokogiri::HTML5::fragment(doc) if doc.is_a?(String) onebox_links = doc.css("a.#{ONEBOX_CSS_CLASS}", *extra_paths) if onebox_links.present? @@ -94,14 +94,14 @@ module Oneboxer def self.apply(string_or_doc, extra_paths: nil) doc = string_or_doc - doc = Nokogiri::HTML::fragment(doc) if doc.is_a?(String) + doc = Nokogiri::HTML5::fragment(doc) if doc.is_a?(String) changed = false each_onebox_link(doc, extra_paths: extra_paths) do |url, element| onebox, _ = yield(url, element) if onebox - parsed_onebox = Nokogiri::HTML::fragment(onebox) + parsed_onebox = Nokogiri::HTML5::fragment(onebox) next unless parsed_onebox.children.count > 0 if element&.parent&.node_name&.downcase == "p" && diff --git a/lib/post_revisor.rb b/lib/post_revisor.rb index 9da306db091..fd85696cf06 100644 --- a/lib/post_revisor.rb +++ b/lib/post_revisor.rb @@ -579,7 +579,7 @@ class PostRevisor def update_category_description return unless category = Category.find_by(topic_id: @topic.id) - doc = Nokogiri::HTML.fragment(@post.cooked) + doc = Nokogiri::HTML5.fragment(@post.cooked) doc.css("img").remove if html = doc.css("p").first&.inner_html&.strip diff --git a/lib/pretty_text.rb b/lib/pretty_text.rb index 73c18d0099c..9f3f65e3507 100644 --- a/lib/pretty_text.rb +++ b/lib/pretty_text.rb @@ -259,7 +259,7 @@ module PrettyText sanitized = markdown(working_text, options) - doc = Nokogiri::HTML.fragment(sanitized) + doc = Nokogiri::HTML5.fragment(sanitized) if !options[:omit_nofollow] && SiteSetting.add_rel_nofollow_to_user_content add_rel_nofollow_to_user_content(doc) @@ -269,7 +269,11 @@ module PrettyText add_mentions(doc, user_id: opts[:user_id]) end - doc.to_html + scrubber = Loofah::Scrubber.new do |node| + node.remove if node.name == 'script' + end + loofah_fragment = Loofah.fragment(doc.to_html) + loofah_fragment.scrub!(scrubber).to_html end def self.add_rel_nofollow_to_user_content(doc) @@ -282,7 +286,7 @@ module PrettyText doc.css("a").each do |l| href = l["href"].to_s begin - uri = URI(href) + uri = URI(URI.escape(href)) site_uri ||= URI(Discourse.base_url) if !uri.host.present? || @@ -305,7 +309,7 @@ module PrettyText def self.extract_links(html) links = [] - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) # remove href inside quotes & elided part doc.css("aside.quote a, .elided a").each { |a| a["href"] = "" } @@ -338,7 +342,7 @@ module PrettyText def self.excerpt(html, max_length, options = {}) # TODO: properly fix this HACK in ExcerptParser without introducing XSS - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) DiscourseEvent.trigger(:reduce_excerpt, doc, options) strip_image_wrapping(doc) strip_oneboxed_media(doc) @@ -350,7 +354,7 @@ module PrettyText return string if string.blank? # If the user is not basic, strip links from their bio - fragment = Nokogiri::HTML.fragment(string) + fragment = Nokogiri::HTML5.fragment(string) fragment.css('a').each { |a| a.replace(a.inner_html) } fragment.to_html end @@ -395,14 +399,14 @@ module PrettyText def self.strip_secure_media(doc) doc.css("a[href]").each do |a| if Upload.secure_media_url?(a["href"]) - target = %w(video audio).include?(a&.parent&.parent&.name) ? a.parent.parent : a + target = %w(video audio).include?(a&.parent&.name) ? a.parent : a target.replace "#{I18n.t("emails.secure_media_placeholder")}
" end end end def self.format_for_email(html, post = nil) - doc = Nokogiri::HTML.fragment(html) + doc = Nokogiri::HTML5.fragment(html) DiscourseEvent.trigger(:reduce_cooked, doc, post) strip_secure_media(doc) if post&.with_secure_media? strip_image_wrapping(doc) @@ -462,13 +466,13 @@ module PrettyText case type when USER_TYPE - element['href'] = "#{Discourse::base_uri}/u/#{name}" + element['href'] = "#{Discourse::base_uri}/u/#{URI.escape(name)}" when GROUP_MENTIONABLE_TYPE element['class'] = 'mention-group notify' - element['href'] = "#{Discourse::base_uri}/groups/#{name}" + element['href'] = "#{Discourse::base_uri}/groups/#{URI.escape(name)}" when GROUP_TYPE element['class'] = 'mention-group' - element['href'] = "#{Discourse::base_uri}/groups/#{name}" + element['href'] = "#{Discourse::base_uri}/groups/#{URI.escape(name)}" end end end diff --git a/lib/quote_comparer.rb b/lib/quote_comparer.rb index 5da2891a7e1..74f39f84a77 100644 --- a/lib/quote_comparer.rb +++ b/lib/quote_comparer.rb @@ -18,7 +18,7 @@ class QuoteComparer def modified? return true if @text.blank? || @parent_post.blank? - parent_text = Nokogiri::HTML::fragment(@parent_post.cooked).text.delete(QuoteComparer.whitespace) + parent_text = Nokogiri::HTML5::fragment(@parent_post.cooked).text.delete(QuoteComparer.whitespace) text = @text.delete(QuoteComparer.whitespace) !parent_text.include?(text) diff --git a/lib/retrieve_title.rb b/lib/retrieve_title.rb index 3c55ec7aecf..227da9f0cbd 100644 --- a/lib/retrieve_title.rb +++ b/lib/retrieve_title.rb @@ -11,7 +11,7 @@ module RetrieveTitle def self.extract_title(html) title = nil - if doc = Nokogiri::HTML(html) + if doc = Nokogiri::HTML5(html) title = doc.at('title')&.inner_text diff --git a/lib/reviewable/conversation.rb b/lib/reviewable/conversation.rb index 53eba48ef1a..696959dd922 100644 --- a/lib/reviewable/conversation.rb +++ b/lib/reviewable/conversation.rb @@ -17,7 +17,7 @@ class Reviewable < ActiveRecord::Base def self.excerpt(cooked) excerpt = ::Post.excerpt(cooked, 250, keep_emoji_images: true) # remove the first link if it's the first node - fragment = Nokogiri::HTML.fragment(excerpt) + fragment = Nokogiri::HTML5.fragment(excerpt) if fragment.children.first == fragment.css("a:first").first && fragment.children.first fragment.children.first.remove end diff --git a/lib/tasks/emoji.rake b/lib/tasks/emoji.rake index ad05c0d1b24..ace173d55c2 100644 --- a/lib/tasks/emoji.rake +++ b/lib/tasks/emoji.rake @@ -353,7 +353,7 @@ def generate_emoji_groups(keywords, sections) puts "Generating groups..." list = open(EMOJI_ORDERING_URL).read - doc = Nokogiri::HTML(list) + doc = Nokogiri::HTML5(list) table = doc.css("table")[0] EMOJI_GROUPS.map do |group| diff --git a/plugins/discourse-details/spec/components/pretty_text_spec.rb b/plugins/discourse-details/spec/components/pretty_text_spec.rb index aa768305ecf..dd0cf14a722 100644 --- a/plugins/discourse-details/spec/components/pretty_text_spec.rb +++ b/plugins/discourse-details/spec/components/pretty_text_spec.rb @@ -8,7 +8,7 @@ describe PrettyText do let(:post) { Fabricate(:post) } it "supports details tag" do - cooked_html = <<~HTML + cooked_html = <<~HTML.gsub("\n", "")Link
Google
@@ -1242,7 +1228,7 @@ describe CookedPostProcessor do
it "uses schemaless CDN url for http uploads" do
Rails.configuration.action_controller.stubs(:asset_host).returns("http://my.cdn.com")
cpp.optimize_urls
- expect(cpp.html).to match_html <<~HTML
+ expect(cpp.html).to match_html <<~HTML.rstrip
Link
Google
@@ -1255,7 +1241,7 @@ describe CookedPostProcessor do
it "doesn't use schemaless CDN url for https uploads" do
Rails.configuration.action_controller.stubs(:asset_host).returns("https://my.cdn.com")
cpp.optimize_urls
- expect(cpp.html).to match_html <<~HTML
+ expect(cpp.html).to match_html <<~HTML.rstrip
Link
Google
@@ -1269,7 +1255,7 @@ describe CookedPostProcessor do
SiteSetting.login_required = true
Rails.configuration.action_controller.stubs(:asset_host).returns("http://my.cdn.com")
cpp.optimize_urls
- expect(cpp.html).to match_html <<~HTML
+ expect(cpp.html).to match_html <<~HTML.rstrip
Link
Google
@@ -1283,7 +1269,7 @@ describe CookedPostProcessor do
SiteSetting.prevent_anons_from_downloading_files = true
Rails.configuration.action_controller.stubs(:asset_host).returns("http://my.cdn.com")
cpp.optimize_urls
- expect(cpp.html).to match_html <<~HTML
+ expect(cpp.html).to match_html <<~HTML.rstrip
Link
Google
@@ -1318,7 +1304,7 @@ describe CookedPostProcessor do
cpp = CookedPostProcessor.new(the_post)
cpp.optimize_urls
- expect(cpp.html).to match_html <<~HTML
+ expect(cpp.html).to match_html <<~HTML.rstrip
This post has a local emoji and an external upload
This post has a local emoji and an external upload
This post has an S3 video onebox:
This post has an S3 video onebox:
+
This post has an S3 video onebox:
This post has a video upload.
This post has a video upload.
This post has an audio upload.
-
This post has an audio upload.
+
And an image upload.