diff --git a/app/mailers/group_smtp_mailer.rb b/app/mailers/group_smtp_mailer.rb index 01ef9b5b17c..15c3d0cf530 100644 --- a/app/mailers/group_smtp_mailer.rb +++ b/app/mailers/group_smtp_mailer.rb @@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base def strip_secure_urls(raw) urls = Set.new - raw.scan(URI.regexp(%w{http https})) { urls << $& } + raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& } urls.each do |url| if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url)) diff --git a/app/mailers/user_notifications.rb b/app/mailers/user_notifications.rb index 62525eea177..aa700f5c028 100644 --- a/app/mailers/user_notifications.rb +++ b/app/mailers/user_notifications.rb @@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base def strip_secure_urls(raw) urls = Set.new - raw.scan(URI.regexp(%w{http https})) { urls << $& } + raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& } urls.each do |url| if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url)) diff --git a/app/services/search_indexer.rb b/app/services/search_indexer.rb index cb60237ab94..bc9c7ecaf8e 100644 --- a/app/services/search_indexer.rb +++ b/app/services/search_indexer.rb @@ -46,13 +46,6 @@ class SearchIndexer d: search_data[3], } - indexed_data = - if table.to_s == "post" - ranked_params[:d] - else - search_data.select { |d| d.length > 0 }.join(' ') - end - tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0] additional_lexemes = [] @@ -75,6 +68,13 @@ class SearchIndexer tsvector = "#{tsvector} #{additional_lexemes.join(' ')}" + indexed_data = + if table.to_s == "post" + clean_post_raw_data!(ranked_params[:d]) + else + search_data.select { |d| d.length > 0 }.join(' ') + end + params = { raw_data: indexed_data, id: id, @@ -216,6 +216,26 @@ class SearchIndexer end end + def self.clean_post_raw_data!(raw_data) + urls = Set.new + raw_data.scan(Discourse::Utils::URI_REGEXP) { urls << $& } + + urls.each do |url| + begin + case File.extname(URI(url).path || "") + when Oneboxer::VIDEO_REGEX + raw_data.gsub!(url, I18n.t("search.video")) + when Oneboxer::AUDIO_REGEX + raw_data.gsub!(url, I18n.t("search.audio")) + end + rescue URI::InvalidURIError + end + end + + raw_data + end + private_class_method :clean_post_raw_data! + class HtmlScrubber < Nokogiri::XML::SAX::Document attr_reader :scrubbed diff --git a/lib/discourse.rb b/lib/discourse.rb index 5d328826337..2c2d8a8fa48 100644 --- a/lib/discourse.rb +++ b/lib/discourse.rb @@ -24,6 +24,8 @@ module Discourse end class Utils + URI_REGEXP = URI.regexp(%w{http https}) + # Usage: # Discourse::Utils.execute_command("pwd", chdir: 'mydirectory') # or with a block diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb index 16260661b55..ba112b50344 100644 --- a/lib/search/grouped_search_results.rb +++ b/lib/search/grouped_search_results.rb @@ -106,23 +106,24 @@ class Search end end - URI_REGEXP = URI.regexp(%w{http https}) - def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true) blurb = nil - cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub - urls = Set.new - cooked.scan(URI_REGEXP) { urls << $& } - urls.each do |url| - begin - case File.extname(URI(url).path || "") - when Oneboxer::VIDEO_REGEX - cooked.gsub!(url, I18n.t("search.video")) - when Oneboxer::AUDIO_REGEX - cooked.gsub!(url, I18n.t("search.audio")) + if scrub + cooked = SearchIndexer.scrub_html_for_search(cooked) + + urls = Set.new + cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& } + urls.each do |url| + begin + case File.extname(URI(url).path || "") + when Oneboxer::VIDEO_REGEX + cooked.gsub!(url, I18n.t("search.video")) + when Oneboxer::AUDIO_REGEX + cooked.gsub!(url, I18n.t("search.audio")) + end + rescue URI::InvalidURIError end - rescue URI::InvalidURIError end end diff --git a/spec/services/search_indexer_spec.rb b/spec/services/search_indexer_spec.rb index f55b183e864..c359c32ce63 100644 --- a/spec/services/search_indexer_spec.rb +++ b/spec/services/search_indexer_spec.rb @@ -209,6 +209,29 @@ describe SearchIndexer do "Let me see how I can fix this image white walkers GOT" ) end + + it 'should strips audio and videos URLs from raw data' do + SiteSetting.authorized_extensions = 'mp4' + upload = Fabricate(:video_upload) + + post.update!(raw: <<~RAW) + link to an external page: https://google.com/?u=bar + + link to an audio file: https://somesite.com/audio.m4a + + link to a video file: https://somesite.com/content/somethingelse.MOV + + link to an invalid URL: http:error] + RAW + + expect(post.post_search_data.raw_data).to eq( + "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]" + ) + + expect(post.post_search_data.search_data).to eq( + "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27" + ) + end end describe '.queue_post_reindex' do