PERF: Replace video and audio links in search blurb while indexing.

In the near future, we will be switching to PG headlines to generate the search blurb. As such, we need to replace audio and video links in the raw data used for headline generation. This also means that we avoid replacing links each time we need to generate the blurb.
2025-05-24 03:21:14 +08:00 · 2020-08-06 12:25:03 +08:00
parent 06ef87da51
commit 255b0e9f14
6 changed files with 68 additions and 22 deletions
--- a/app/mailers/group_smtp_mailer.rb
+++ b/app/mailers/group_smtp_mailer.rb
@ -88,7 +88,7 @@ class GroupSmtpMailer < ActionMailer::Base
  def strip_secure_urls(raw)
    urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
    urls.each do |url|
      if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
--- a/app/mailers/user_notifications.rb
+++ b/app/mailers/user_notifications.rb
@ -365,7 +365,7 @@ class UserNotifications < ActionMailer::Base
  def strip_secure_urls(raw)
    urls = Set.new
-    raw.scan(URI.regexp(%w{http https})) { urls << $& }
+    raw.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
    urls.each do |url|
      if (url.start_with?(Discourse.store.s3_upload_host) && FileHelper.is_supported_media?(url))
--- a/app/services/search_indexer.rb
+++ b/app/services/search_indexer.rb
@ -46,13 +46,6 @@ class SearchIndexer
      d: search_data[3],
    }
    indexed_data =
      if table.to_s == "post"
        ranked_params[:d]
      else
        search_data.select { |d| d.length > 0 }.join(' ')
      end
    tsvector = DB.query_single("SELECT #{ranked_index}", ranked_params)[0]
    additional_lexemes = []
@ -75,6 +68,13 @@ class SearchIndexer
    tsvector = "#{tsvector} #{additional_lexemes.join(' ')}"
    indexed_data =
      if table.to_s == "post"
        clean_post_raw_data!(ranked_params[:d])
      else
        search_data.select { |d| d.length > 0 }.join(' ')
      end
    params = {
      raw_data: indexed_data,
      id: id,
@ -216,6 +216,26 @@ class SearchIndexer
    end
  end
  # Swaps every audio/video link inside +raw_data+ for a localized
  # placeholder (`search.video` / `search.audio`).
  #
  # The string is modified in place and the very same object is returned.
  # Links whose path has any other extension are left alone, as are matches
  # that `URI()` refuses to parse.
  def self.clean_post_raw_data!(raw_data)
    # Gather the distinct links up front so each one is processed only once
    # and the string is not rewritten while it is still being scanned.
    links = Set.new
    raw_data.scan(Discourse::Utils::URI_REGEXP) { links << Regexp.last_match(0) }

    links.each do |link|
      extension =
        begin
          File.extname(URI(link).path || "")
        rescue URI::InvalidURIError
          # Unparsable link: keep it verbatim in the indexed data.
          next
        end

      placeholder_key =
        if Oneboxer::VIDEO_REGEX.match?(extension)
          "search.video"
        elsif Oneboxer::AUDIO_REGEX.match?(extension)
          "search.audio"
        end

      raw_data.gsub!(link, I18n.t(placeholder_key)) if placeholder_key
    end

    raw_data
  end
  private_class_method :clean_post_raw_data!
  class HtmlScrubber < Nokogiri::XML::SAX::Document
    attr_reader :scrubbed
--- a/lib/discourse.rb
+++ b/lib/discourse.rb
@ -24,6 +24,8 @@ module Discourse
  end
  class Utils
    URI_REGEXP = URI.regexp(%w{http https})
    # Usage:
    #   Discourse::Utils.execute_command("pwd", chdir: 'mydirectory')
    # or with a block
--- a/lib/search/grouped_search_results.rb
+++ b/lib/search/grouped_search_results.rb
@ -106,23 +106,24 @@ class Search
      end
    end
    URI_REGEXP = URI.regexp(%w{http https})
    def self.blurb_for(cooked: nil, term: nil, blurb_length: BLURB_LENGTH, scrub: true)
      blurb = nil
      cooked = SearchIndexer.scrub_html_for_search(cooked) if scrub
-      urls = Set.new
+      if scrub
-      cooked.scan(URI_REGEXP) { urls << $& }
+        cooked = SearchIndexer.scrub_html_for_search(cooked)
-      urls.each do |url|
+
-        begin
+        urls = Set.new
-          case File.extname(URI(url).path || "")
+        cooked.scan(Discourse::Utils::URI_REGEXP) { urls << $& }
-          when Oneboxer::VIDEO_REGEX
+        urls.each do |url|
-            cooked.gsub!(url, I18n.t("search.video"))
+          begin
-          when Oneboxer::AUDIO_REGEX
+            case File.extname(URI(url).path || "")
-            cooked.gsub!(url, I18n.t("search.audio"))
+            when Oneboxer::VIDEO_REGEX
              cooked.gsub!(url, I18n.t("search.video"))
            when Oneboxer::AUDIO_REGEX
              cooked.gsub!(url, I18n.t("search.audio"))
            end
          rescue URI::InvalidURIError
          end
        rescue URI::InvalidURIError
        end
      end
--- a/spec/services/search_indexer_spec.rb
+++ b/spec/services/search_indexer_spec.rb
@ -209,6 +209,29 @@ describe SearchIndexer do
        "Let me see how I can fix this image white walkers GOT"
      )
    end
    # Verifies that updating a post's raw text stores placeholder labels for
    # audio/video links in `post_search_data.raw_data`, while `search_data`
    # still carries lexemes derived from the original URLs.
    it 'should strips audio and videos URLs from raw data' do
      SiteSetting.authorized_extensions = 'mp4'
      # NOTE(review): `upload` is never referenced again in this example;
      # presumably fabricated for its side effects — confirm it is still needed.
      upload = Fabricate(:video_upload)

      # Covers four cases: a non-media link, an audio link, a video link with
      # an upper-case extension, and a malformed URL.
      post.update!(raw: <<~RAW)
      link to an external page: https://google.com/?u=bar
      link to an audio file: https://somesite.com/audio.m4a
      link to a video file: https://somesite.com/content/somethingelse.MOV
      link to an invalid URL: http:error]
      RAW

      # Media links become the localized audio/video labels; the malformed
      # URL is expected to remain verbatim.
      expect(post.post_search_data.raw_data).to eq(
        "link to an external page: https://google.com/ link to an audio file: #{I18n.t("search.audio")} link to a video file: #{I18n.t("search.video")} link to an invalid URL: http:error]"
      )

      # The expected tsvector still contains the media paths
      # (e.g. '/audio.m4a'), i.e. only raw_data has the links replaced.
      expect(post.post_search_data.search_data).to eq(
        "'/audio.m4a':23 '/content/somethingelse.mov':31 'audio':19 'com':15,22,30 'error':38 'extern':13 'file':20,28 'google.com':15 'http':37 'invalid':35 'link':10,16,24,32 'page':14 'somesite.com':22,30 'somesite.com/audio.m4a':21 'somesite.com/content/somethingelse.mov':29 'test':8A 'titl':4A 'uncategor':9B 'url':36 'video':27"
      )
    end
  end
  describe '.queue_post_reindex' do