PERF: Replace video and audio links in search blurb while indexing.

In the near future, we will be switching to PG headlines to generate the
search blurb. As such, we need to replace audio and video links in the
raw data used for headline generation. This also means that we avoid
replacing links each time we need to generate the blurb.
This commit is contained in:
Guo Xiang Tan
2020-08-06 12:25:03 +08:00
parent 06ef87da51
commit 255b0e9f14
6 changed files with 68 additions and 22 deletions

View File

@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base
def strip_secure_urls(raw) def strip_secure_urls(raw)
urls = Set.new urls = Set.new
raw.scan(URI.regexp(%w{http https})) { urls << $& } raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
urls.each do |url| urls.each do |url|
if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url)) if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))

View File

@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base
def strip_secure_urls(raw) def strip_secure_urls(raw)
urls = Set.new urls = Set.new
raw.scan(URI.regexp(%w{http https})) { urls << $& } raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
urls.each do |url| urls.each do |url|
if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url)) if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))

View File

@ -46,13 +46,6 @@ class SearchIndexer
d: search_data[3], d: search_data[3],
} }
indexed_data =
if table.to_s == "post"
ranked_params[:d]
else
search_data.select { |d| d.length > 0 }.join(' ')
end
tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0] tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
additional_lexemes = [] additional_lexemes = []
@ -75,6 +68,13 @@ class SearchIndexer
tsvector = "#{tsvector} #{additional_lexemes.join(' ')}" tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
indexed_data =
if table.to_s == "post"
clean_post_raw_data!(ranked_params[:d])
else
search_data.select { |d| d.length > 0 }.join(' ')
end
params = { params = {
raw_data: indexed_data, raw_data: indexed_data,
id: id, id: id,
@ -216,6 +216,26 @@ class SearchIndexer
end end
end end
# Replaces every http(s) link in +raw_data+ whose path extension matches
# Oneboxer::VIDEO_REGEX or Oneboxer::AUDIO_REGEX with the translated
# "search.video" / "search.audio" placeholder.
#
# The string is modified in place (hence the bang) and the same object is
# returned. Links that URI() cannot parse are left untouched.
def self.clean_post_raw_data!(raw_data)
  # URI_REGEXP contains capture groups, so collect the whole match rather
  # than the per-group arrays that a block-less scan would return.
  found_links = Set.new
  raw_data.scan(Discourse::Utils::URI_REGEXP) { found_links.add(Regexp.last_match(0)) }

  found_links.each do |link|
    extension =
      begin
        File.extname(URI(link).path || "")
      rescue URI::InvalidURIError
        # Unparseable link: keep it verbatim and move on to the next one.
        next
      end

    placeholder_key =
      case extension
      when Oneboxer::VIDEO_REGEX then "search.video"
      when Oneboxer::AUDIO_REGEX then "search.audio"
      end

    raw_data.gsub!(link, I18n.t(placeholder_key)) if placeholder_key
  end

  raw_data
end
private_class_method :clean_post_raw_data!
class HtmlScrubber < Nokogiri::XML::SAX::Document class HtmlScrubber < Nokogiri::XML::SAX::Document
attr_reader :scrubbed attr_reader :scrubbed

View File

@ -24,6 +24,8 @@ module Discourse
end end
class Utils class Utils
URI_REGEXP = URI.regexp(%w{http https})
# Usage: # Usage:
# Discourse::Utils.execute_command("pwd", chdir: 'mydirectory') # Discourse::Utils.execute_command("pwd", chdir: 'mydirectory')
# or with a block # or with a block

View File

@ -106,23 +106,24 @@ class Search
end end
end end
URI_REGEXP = URI.regexp(%w{http https})
def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true) def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
blurb = nil blurb = nil
cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
urls = Set.new if scrub
cooked.scan(URI_REGEXP) { urls << $& } cooked = SearchIndexer.scrub_html_for_search(cooked)
urls.each do |url|
begin urls = Set.new
case File.extname(URI(url).path || "") cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
when Oneboxer::VIDEO_REGEX urls.each do |url|
cooked.gsub!(url, I18n.t("search.video")) begin
when Oneboxer::AUDIO_REGEX case File.extname(URI(url).path || "")
cooked.gsub!(url, I18n.t("search.audio")) when Oneboxer::VIDEO_REGEX
cooked.gsub!(url, I18n.t("search.video"))
when Oneboxer::AUDIO_REGEX
cooked.gsub!(url, I18n.t("search.audio"))
end
rescue URI::InvalidURIError
end end
rescue URI::InvalidURIError
end end
end end

View File

@ -209,6 +209,29 @@ describe SearchIndexer do
"Let me see how I can fix this image white walkers GOT" "Let me see how I can fix this image white walkers GOT"
) )
end end
# Checks that audio/video links in a post's raw text are swapped for the
# translated placeholders in the stored raw_data, while search_data still
# carries the lexemes of the original links.
it 'should strips audio and videos URLs from raw data' do
SiteSetting.authorized_extensions = 'mp4'
# NOTE(review): `upload` is not referenced again in this example — confirm
# whether fabricating it is needed for its side effects.
upload = Fabricate(:video_upload)
# The raw text covers four cases: a regular page link, an audio link, a
# video link with an upper-case extension, and a malformed URL.
post.update!(raw: <<~RAW)
link to an external page: https://google.com/?u=bar
link to an audio file: https://somesite.com/audio.m4a
link to a video file: https://somesite.com/content/somethingelse.MOV
link to an invalid URL: http:error]
RAW
# raw_data: the audio and video links are expected to become the
# "search.audio" / "search.video" translations; the malformed URL stays as is.
expect(post.post_search_data.raw_data).to eq(
"link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]"
)
# search_data: the expected tsvector still contains the media file paths.
expect(post.post_search_data.search_data).to eq(
"'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
)
end
end end
describe '.queue_post_reindex' do describe '.queue_post_reindex' do