pull hotlinked images

2025-05-22 16:34:31 +08:00 · 2013-11-05 19:04:47 +01:00
parent 8724b2e2b6
commit 37fd7ab574
25 changed files with 1076 additions and 827 deletions
--- a/lib/cooked_post_processor.rb
+++ b/lib/cooked_post_processor.rb
@ -1,7 +1,7 @@
 # Post processing that we can do after a post has already been cooked.
 # For example, inserting the onebox content, or image sizes/thumbnails.

-require_dependency 'oneboxer'
+require_dependency "oneboxer"

 class CookedPostProcessor
  include ActionView::Helpers::NumberHelper
@ -12,27 +12,38 @@ class CookedPostProcessor
    @post = post
    @doc = Nokogiri::HTML::fragment(post.cooked)
    @size_cache = {}
-    @has_been_uploaded_cache = {}
  end

  def post_process
-    clean_up_reverse_index
-    post_process_attachments
+    keep_reverse_index_up_to_date
    post_process_images
    post_process_oneboxes
+    optimize_urls
+    pull_hotlinked_images
  end

-  def clean_up_reverse_index
-    PostUpload.delete_all(post_id: @post.id)
-  end
+  def keep_reverse_index_up_to_date
+    upload_ids = Set.new

-  def post_process_attachments
-    attachments.each do |attachment|
-      href = attachment['href']
-      attachment['href'] = relative_to_absolute(href)
-      # update reverse index
+    @doc.search("a").each do |a|
+      href = a["href"].to_s
      if upload = Upload.get_from_url(href)
-        associate_to_post(upload)
+        upload_ids << upload.id
+      end
+    end
+
+    @doc.search("img").each do |img|
+      src = img["src"].to_s
+      if upload = Upload.get_from_url(src)
+        upload_ids << upload.id
+      end
+    end
+
+    values = upload_ids.map{ |u| "(#{@post.id},#{u})" }.join(",")
+    PostUpload.transaction do
+      PostUpload.delete_all(post_id: @post.id)
+      if upload_ids.length > 0
+        PostUpload.exec_sql("INSERT INTO post_uploads (post_id, upload_id) VALUES #{values}")
      end
    end
  end
@ -42,78 +53,53 @@ class CookedPostProcessor
    return if images.blank?

    images.each do |img|
-      if img['src'].present?
-        # keep track of the original src
-        src = img['src']
-        # make sure the src is absolute (when working with locally uploaded files)
-        img['src'] = relative_to_absolute(src)
-        # make sure the img has proper width and height attributes
-        update_dimensions!(img)
-        # retrieve the associated upload, if any
-        if upload = Upload.get_from_url(src)
-          # update reverse index
-          associate_to_post(upload)
-        end
-        # lightbox treatment
-        convert_to_link!(img, upload)
-        # mark the post as dirty whenever the src has changed
-        @dirty |= src != img['src']
-      end
+      src, width, height = img["src"], img["width"], img["height"]
+      limit_size!(img)
+      convert_to_link!(img)
+      @dirty |= (src != img["src"]) || (width.to_i != img["width"].to_i) || (height.to_i != img["height"].to_i)
    end

-    # Extract the first image from the first post and use it as the 'topic image'
-    extract_topic_image(images)
-  end
-
-  def post_process_oneboxes
-    args = { post_id: @post.id }
-    args[:invalidate_oneboxes] = true if @opts[:invalidate_oneboxes]
-    # bake onebox content into the post
-    result = Oneboxer.apply(@doc) do |url, element|
-      Oneboxer.onebox(url, args)
-    end
-    # mark the post as dirty whenever a onebox as been baked
-    @dirty |= result.changed?
+    update_topic_image(images)
  end

  def extract_images
-    # do not extract images inside a onebox or a quote
+    # do not extract images inside oneboxes or quotes
    @doc.css("img") - @doc.css(".onebox-result img") - @doc.css(".quote img")
  end

-  def relative_to_absolute(src)
-    if src =~ /^\/[^\/]/
-      Discourse.base_url_no_prefix + src
-    else
-      src
+  def limit_size!(img)
+    w, h = get_size_from_image_sizes(img["src"], @opts[:image_sizes]) || get_size(img["src"])
+    # limit the size of the thumbnail
+    img["width"], img["height"] = ImageSizer.resize(w, h)
+  end
+
+  def get_size_from_image_sizes(src, image_sizes)
+    return unless image_sizes.present?
+    image_sizes.each do |image_size|
+      url, size = image_size[0], image_size[1]
+      return [size["width"], size["height"]] if url.include?(src)
    end
  end

-  def update_dimensions!(img)
-    w, h = get_size_from_image_sizes(img['src'], @opts[:image_sizes]) || get_size(img['src'])
-    # make sure we limit the size of the thumbnail
-    w, h = ImageSizer.resize(w, h)
-    # check whether the dimensions have changed
-    @dirty = (img['width'].to_i != w) || (img['height'].to_i != h)
-    # update the dimensions
-    img['width'] = w
-    img['height'] = h
+  def get_size(url)
+    absolute_url = url
+    absolute_url = Discourse.base_url_no_prefix + absolute_url if absolute_url =~ /^\/[^\/]/
+    # FastImage fails when there's no scheme
+    absolute_url = (SiteSetting.use_ssl? ? "https:" : "http:") + absolute_url if absolute_url.start_with?("//")
+    return unless is_valid_image_url?(absolute_url)
+    # we can *always* crawl our own images
+    return unless SiteSetting.crawl_images? || Discourse.store.has_been_uploaded?(url)
+    @size_cache[url] ||= FastImage.size(absolute_url)
+  rescue Zlib::BufError # FastImage.size raises BufError for some gifs
  end

-  def associate_to_post(upload)
-    return if PostUpload.where(post_id: @post.id, upload_id: upload.id).count > 0
-    PostUpload.create(post_id: @post.id, upload_id: upload.id)
-  rescue ActiveRecord::RecordNotUnique
-    # do not care if it's already associated
+  def is_valid_image_url?(url)
+    uri = URI.parse(url)
+    %w(http https).include? uri.scheme
+  rescue URI::InvalidURIError
  end

-  def optimize_image!(img)
-    # TODO
-    # 1) optimize using image_optim
-    # 2) .png vs. .jpg (> 1.5x)
-  end
-
-  def convert_to_link!(img, upload=nil)
+  def convert_to_link!(img)
    src = img["src"]
    return unless src.present?

@ -123,12 +109,10 @@ class CookedPostProcessor
    return if original_width.to_i <= width && original_height.to_i <= height
    return if original_width.to_i <= SiteSetting.max_image_width && original_height.to_i <= SiteSetting.max_image_height

-    return if is_a_hyperlink(img)
+    return if is_a_hyperlink?(img)

-    if upload
-      # create a thumbnail
+    if upload = Upload.get_from_url(src)
      upload.create_thumbnail!(width, height)
-      # optimize image
      # TODO: optimize_image!(img)
    end

@ -137,7 +121,7 @@ class CookedPostProcessor
    @dirty = true
  end

-  def is_a_hyperlink(img)
+  def is_a_hyperlink?(img)
    parent = img.parent
    while parent
      return if parent.name == "a"
@ -155,21 +139,20 @@ class CookedPostProcessor
    # then, the link to our larger image
    a = Nokogiri::XML::Node.new("a", @doc)
    img.add_next_sibling(a)
-    a["href"] = img['src']
+    a["href"] = img["src"]
    a["class"] = "lightbox"
    a.add_child(img)

    # replace the image by its thumbnail
-    w = img["width"]
-    h = img["height"]
-    img['src'] = relative_to_absolute(upload.thumbnail(w, h).url) if upload && upload.has_thumbnail?(w, h)
+    w, h = img["width"].to_i, img["height"].to_i
+    img["src"] = upload.thumbnail(w, h).url if upload && upload.has_thumbnail?(w, h)

    # then, some overlay informations
    meta = Nokogiri::XML::Node.new("div", @doc)
    meta["class"] = "meta"
    img.add_next_sibling(meta)

-    filename = get_filename(upload, img['src'])
+    filename = get_filename(upload, img["src"])
    informations = "#{original_width}x#{original_height}"
    informations << " #{number_to_human_size(upload.filesize)}" if upload

@ -181,48 +164,70 @@ class CookedPostProcessor
  def get_filename(upload, src)
    return File.basename(src) unless upload
    return upload.original_filename unless upload.original_filename =~ /^blob(\.png)?$/i
-    return I18n.t('upload.pasted_image_filename')
+    return I18n.t("upload.pasted_image_filename")
  end

  def create_span_node(klass, content=nil)
    span = Nokogiri::XML::Node.new("span", @doc)
    span.content = content if content
-    span['class'] = klass
+    span["class"] = klass
    span
  end

-  def extract_topic_image(images)
+  def update_topic_image(images)
    if @post.post_number == 1
      img = images.first
-      @post.topic.update_column :image_url, img['src'] if img['src'].present?
+      @post.topic.update_column(:image_url, img["src"]) if img["src"].present?
    end
  end

-  def get_size_from_image_sizes(src, image_sizes)
-    [image_sizes[src]["width"], image_sizes[src]["height"]] if image_sizes.present? && image_sizes[src].present?
+  def post_process_oneboxes
+    args = {
+      post_id: @post.id,
+      invalidate_oneboxes: !!@opts[:invalidate_oneboxes],
+    }
+
+    result = Oneboxer.apply(@doc) do |url, element|
+      Oneboxer.onebox(url, args)
+    end
+
+    @dirty |= result.changed?
  end

-  def get_size(url)
-    uri = url
-    # make sure urls have a scheme (otherwise, FastImage will fail)
-    uri = (SiteSetting.use_ssl? ? "https:" : "http:") + url if url && url.start_with?("//")
-    return unless is_valid_image_uri?(uri)
-    # we can *always* crawl our own images
-    return unless SiteSetting.crawl_images? || Discourse.store.has_been_uploaded?(url)
-    @size_cache[url] ||= FastImage.size(uri)
-  rescue Zlib::BufError # FastImage.size raises BufError for some gifs
+  def optimize_urls
+    @doc.search("a").each do |a|
+      href = a["href"].to_s
+      if Discourse.store.has_been_uploaded?(href)
+        a["href"] = schemaless relative_to_absolute(href)
+      end
+    end
+
+    @doc.search("img").each do |img|
+      src = img["src"].to_s
+      if Discourse.store.has_been_uploaded?(src)
+        img["src"] = schemaless relative_to_absolute(src)
+      end
+    end
  end

-  def is_valid_image_uri?(url)
-    uri = URI.parse(url)
-    %w(http https).include? uri.scheme
-  rescue URI::InvalidURIError
+  def relative_to_absolute(url)
+    url =~ /^\/[^\/]/ ? (Discourse.asset_host || Discourse.base_url_no_prefix) + url : url
  end

-  def attachments
-    attachments = @doc.css("a.attachment[href^=\"#{Discourse.store.absolute_base_url}\"]")
-    attachments += @doc.css("a.attachment[href^=\"#{Discourse.store.relative_base_url}\"]") if Discourse.store.internal?
-    attachments
+  def schemaless(url)
+    url.gsub(/^https?:/, "")
+  end
+
+  def pull_hotlinked_images
+    # we don't want to run the job if we're not allowed to crawl images
+    return unless SiteSetting.crawl_images?
+    # we only want to run the job whenever it's changed by a user
+    return if @post.updated_by == Discourse.system_user
+    # make sure no other job is scheduled
+    Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: @post.id)
+    # schedule the job
+    delay = SiteSetting.ninja_edit_window + 1
+    Jobs.enqueue_in(delay.minutes.to_i, :pull_hotlinked_images, post_id: @post.id)
  end

  def dirty?