From bf6f8299a781922737fa01c408078f4c090885b6 Mon Sep 17 00:00:00 2001 From: David Taylor Date: Mon, 16 May 2022 17:56:00 +0100 Subject: [PATCH] FEATURE: Pull hotlinked images immediately after posting Previously, with the default `editing_grace_period`, hotlinked images were pulled 5 minutes after a post is created. This delay was added to reduce the chance of automated edits clashing with user edits. This commit refactors things so that we can pull hotlinked images immediately. URLs are immediately updated in the post's `cooked` HTML. The post's raw markdown is updated later, after the `editing_grace_period`. This involves a number of behind-the-scenes changes including: - Schedule Jobs::PullHotlinkedImages immediately after Jobs::ProcessPost. Move scheduling to after the `update_column` call to avoid race conditions - Move raw changes into a separate job, which is delayed until after the ninja-edit window - Move disable_if_low_on_disk_space logic into the `pull_hotlinked_images` job - Move raw-parsing/replacing logic into `InlineUpload` so it can be easily be shared between `UpdateHotlinkedRaw` and `PullUserProfileHotlinkedImages` --- app/jobs/regular/process_post.rb | 7 ++ app/jobs/regular/pull_hotlinked_images.rb | 93 +++++++----------- .../pull_user_profile_hotlinked_images.rb | 10 +- app/jobs/regular/update_hotlinked_raw.rb | 28 ++++++ app/services/inline_uploads.rb | 34 +++++++ lib/cooked_post_processor.rb | 36 ------- spec/jobs/process_post_spec.rb | 31 ++++++ spec/jobs/pull_hotlinked_images_spec.rb | 56 ++++++++++- spec/lib/cooked_post_processor_spec.rb | 97 ------------------- 9 files changed, 193 insertions(+), 199 deletions(-) create mode 100644 app/jobs/regular/update_hotlinked_raw.rb diff --git a/app/jobs/regular/process_post.rb b/app/jobs/regular/process_post.rb index b573f0683c8..60f444fe981 100644 --- a/app/jobs/regular/process_post.rb +++ b/app/jobs/regular/process_post.rb @@ -40,6 +40,8 @@ module Jobs end end + enqueue_pull_hotlinked_images(post) unless args[:skip_pull_hotlinked_images] + if !post.user&.staff? && !post.user&.staged? s = post.raw s << " #{post.topic.title}" if post.post_number == 1 @@ -60,6 +62,11 @@ module Jobs TopicLink.extract_from(post) QuotedPost.extract_from(post) end + + def enqueue_pull_hotlinked_images(post) + Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: post.id) + Jobs.enqueue(:pull_hotlinked_images, post_id: post.id) + end end end diff --git a/app/jobs/regular/pull_hotlinked_images.rb b/app/jobs/regular/pull_hotlinked_images.rb index ba3389843ea..6b5d6d2ddbd 100644 --- a/app/jobs/regular/pull_hotlinked_images.rb +++ b/app/jobs/regular/pull_hotlinked_images.rb @@ -10,16 +10,13 @@ module Jobs end def execute(args) + disable_if_low_on_disk_space + @post_id = args[:post_id] raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank? post = Post.find_by(id: @post_id) - return if post.blank? - return if post.topic.blank? - return if post.cook_method == Post.cook_methods[:raw_html] - - raw = post.raw.dup - start_raw = raw.dup + return if post.nil? || post.topic.nil? hotlinked_map = post.post_hotlinked_media.map { |r| [r.url, r] }.to_h @@ -55,30 +52,23 @@ module Jobs changed_hotlink_records = true hotlink_record.save! end - - # have we successfully downloaded that file? - if upload = hotlink_record&.upload - raw = replace_in_raw(original_src: original_src, upload: upload, raw: raw) - end rescue => e raise e if Rails.env.test? log(:error, "Failed to pull hotlinked image (#{download_src}) post: #{@post_id}\n" + e.message + "\n" + e.backtrace.join("\n")) end - # If post changed while we were downloading images, never apply edits - post.reload - post_changed_elsewhere = (start_raw != post.raw) - raw_changed_here = (raw != post.raw) - - if !post_changed_elsewhere && raw_changed_here - changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") } - post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true) - elsif changed_hotlink_records + if changed_hotlink_records post.trigger_post_process( bypass_bump: true, skip_pull_hotlinked_images: true # Avoid an infinite loop of job scheduling ) end + + if hotlinked_map.size > 0 + Jobs.cancel_scheduled_job(:update_hotlinked_raw, post_id: post.id) + update_raw_delay = SiteSetting.editing_grace_period + 1 + Jobs.enqueue_in(update_raw_delay, :update_hotlinked_raw, post_id: post.id) + end end def download(src) @@ -137,45 +127,6 @@ module Jobs end end - def replace_in_raw(original_src:, raw:, upload:) - raw = raw.dup - escaped_src = Regexp.escape(original_src) - - replace_raw = ->(match, match_src, replacement, _index) { - if normalize_src(original_src) == normalize_src(match_src) - replacement = - if replacement.include?(InlineUploads::PLACEHOLDER) - replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url) - elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER) - replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path) - end - - raw = raw.gsub( - match, - replacement - ) - end - } - - # there are 6 ways to insert an image in a post - # HTML tag - - InlineUploads.match_img(raw, external_src: true, &replace_raw) - - # BBCode tag - [img]http://...[/img] - InlineUploads.match_bbcode_img(raw, external_src: true, &replace_raw) - - # Markdown linked image - [![alt](http://...)](http://...) - # Markdown inline - ![alt](http://...) - # Markdown inline - ![](http://... "image title") - # Markdown inline - ![alt](http://... "image title") - InlineUploads.match_md_inline_img(raw, external_src: true, &replace_raw) - - # Direct link - raw.gsub!(/^#{escaped_src}(\s?)$/) { "![](#{upload.short_url})#{$1}" } - - raw - end - def extract_images_from(html) doc = Nokogiri::HTML5::fragment(html) @@ -239,6 +190,30 @@ module Jobs def normalize_src(src) PostHotlinkedMedia.normalize_src(src) end + + def disable_if_low_on_disk_space + return if Discourse.store.external? + return if !SiteSetting.download_remote_images_to_local + return if available_disk_space >= SiteSetting.download_remote_images_threshold + + SiteSetting.download_remote_images_to_local = false + + # log the site setting change + reason = I18n.t("disable_remote_images_download_reason") + staff_action_logger = StaffActionLogger.new(Discourse.system_user) + staff_action_logger.log_site_setting_change("download_remote_images_to_local", true, false, details: reason) + + # also send a private message to the site contact user notify_about_low_disk_space + notify_about_low_disk_space + end + + def notify_about_low_disk_space + SystemMessage.create_from_system_user(Discourse.site_contact_user, :download_remote_images_disabled) + end + + def available_disk_space + 100 - DiskSpace.percent_free("#{Rails.root}/public/uploads") + end end end diff --git a/app/jobs/regular/pull_user_profile_hotlinked_images.rb b/app/jobs/regular/pull_user_profile_hotlinked_images.rb index ae87a9043d7..66280f2f7eb 100644 --- a/app/jobs/regular/pull_user_profile_hotlinked_images.rb +++ b/app/jobs/regular/pull_user_profile_hotlinked_images.rb @@ -30,16 +30,16 @@ module Jobs rescue ImageBrokenError broken_image_urls << normalized_src end - - # have we successfully downloaded that file? - if upload = downloaded_images[normalized_src] - user_profile.bio_raw = replace_in_raw(original_src: original_src, upload: upload, raw: user_profile.bio_raw) - end rescue => e raise e if Rails.env.test? log(:error, "Failed to pull hotlinked image (#{download_src}) user: #{@user_id}\n" + e.message + "\n" + e.backtrace.join("\n")) end + user_profile.bio_raw = InlineUploads.replace_hotlinked_image_urls(raw: user_profile.bio_raw) do |match_src| + normalized_match_src = PostHotlinkedMedia.normalize_src(match_src) + downloaded_images[normalized_match_src] + end + user_profile.skip_pull_hotlinked_image = true user_profile.save! end diff --git a/app/jobs/regular/update_hotlinked_raw.rb b/app/jobs/regular/update_hotlinked_raw.rb new file mode 100644 index 00000000000..b938a1ffaae --- /dev/null +++ b/app/jobs/regular/update_hotlinked_raw.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +module Jobs + class UpdateHotlinkedRaw < ::Jobs::Base + sidekiq_options queue: 'low' + + def execute(args) + @post_id = args[:post_id] + raise Discourse::InvalidParameters.new(:post_id) if @post_id.blank? + + post = Post.find_by(id: @post_id) + return if post.nil? + return if post.cook_method == Post.cook_methods[:raw_html] + + hotlinked_map = post.post_hotlinked_media.preload(:upload).map { |r| [r.url, r] }.to_h + + raw = InlineUploads.replace_hotlinked_image_urls(raw: post.raw) do |match_src| + normalized_match_src = PostHotlinkedMedia.normalize_src(match_src) + hotlinked_map[normalized_match_src]&.upload + end + + if post.raw != raw + changes = { raw: raw, edit_reason: I18n.t("upload.edit_reason") } + post.revise(Discourse.system_user, changes, bypass_bump: true, skip_staff_log: true) + end + end + end +end diff --git a/app/services/inline_uploads.rb b/app/services/inline_uploads.rb index 55a162a8ea4..b2519f6331f 100644 --- a/app/services/inline_uploads.rb +++ b/app/services/inline_uploads.rb @@ -214,6 +214,40 @@ class InlineUploads end end + def self.replace_hotlinked_image_urls(raw:, &blk) + replace = Proc.new do |match, match_src, replacement, _index| + upload = blk.call(match_src) + next if !upload + + replacement = + if replacement.include?(InlineUploads::PLACEHOLDER) + replacement.sub(InlineUploads::PLACEHOLDER, upload.short_url) + elsif replacement.include?(InlineUploads::PATH_PLACEHOLDER) + replacement.sub(InlineUploads::PATH_PLACEHOLDER, upload.short_path) + end + + raw = raw.gsub( + match, + replacement + ) + end + + # there are 6 ways to insert an image in a post + # HTML tag - + InlineUploads.match_img(raw, external_src: true, &replace) + + # BBCode tag - [img]http://...[/img] + InlineUploads.match_bbcode_img(raw, external_src: true, &replace) + + # Markdown linked image - [![alt](http://...)](http://...) + # Markdown inline - ![alt](http://...) + # Markdown inline - ![](http://... "image title") + # Markdown inline - ![alt](http://... "image title") + InlineUploads.match_md_inline_img(raw, external_src: true, &replace) + + raw + end + def self.matched_uploads(node) upload_path = Discourse.store.upload_path base_url = Discourse.base_url.sub(/https?:\/\//, "(https?://)") diff --git a/lib/cooked_post_processor.rb b/lib/cooked_post_processor.rb index 8444a4768bb..189c048dcf3 100644 --- a/lib/cooked_post_processor.rb +++ b/lib/cooked_post_processor.rb @@ -47,7 +47,6 @@ class CookedPostProcessor remove_user_ids update_post_image enforce_nofollow - pull_hotlinked_images grant_badges @post.link_post_uploads(fragments: @doc) DiscourseEvent.trigger(:post_process_cooked, @doc, @post) @@ -361,41 +360,6 @@ class CookedPostProcessor PrettyText.add_rel_attributes_to_user_content(@doc, add_nofollow) end - def pull_hotlinked_images - return if @opts[:skip_pull_hotlinked_images] - # have we enough disk space? - disable_if_low_on_disk_space # But still enqueue the job - # make sure no other job is scheduled - Jobs.cancel_scheduled_job(:pull_hotlinked_images, post_id: @post.id) - # schedule the job - delay = SiteSetting.editing_grace_period + 1 - Jobs.enqueue_in(delay.seconds.to_i, :pull_hotlinked_images, post_id: @post.id) - end - - def disable_if_low_on_disk_space - return if Discourse.store.external? - return if !SiteSetting.download_remote_images_to_local - return if available_disk_space >= SiteSetting.download_remote_images_threshold - - SiteSetting.download_remote_images_to_local = false - - # log the site setting change - reason = I18n.t("disable_remote_images_download_reason") - staff_action_logger = StaffActionLogger.new(Discourse.system_user) - staff_action_logger.log_site_setting_change("download_remote_images_to_local", true, false, details: reason) - - # also send a private message to the site contact user notify_about_low_disk_space - notify_about_low_disk_space - end - - def notify_about_low_disk_space - SystemMessage.create_from_system_user(Discourse.site_contact_user, :download_remote_images_disabled) - end - - def available_disk_space - 100 - DiskSpace.percent_free("#{Rails.root}/public/uploads") - end - private def post_process_images diff --git a/spec/jobs/process_post_spec.rb b/spec/jobs/process_post_spec.rb index fc20d8f5083..0fb49e684b9 100644 --- a/spec/jobs/process_post_spec.rb +++ b/spec/jobs/process_post_spec.rb @@ -88,4 +88,35 @@ describe Jobs::ProcessPost do expect(post.topic.reload.excerpt).to eq("Some OP content") end end + + context "#enqueue_pull_hotlinked_images" do + fab!(:post) { Fabricate(:post, created_at: 20.days.ago) } + let(:job) { Jobs::ProcessPost.new } + + it "runs even when download_remote_images_to_local is disabled" do + # We want to run it to pull hotlinked optimized images + SiteSetting.download_remote_images_to_local = false + expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }) do + job.execute({ post_id: post.id }) + end + end + + context "when download_remote_images_to_local? is enabled" do + before do + SiteSetting.download_remote_images_to_local = true + end + + it "enqueues" do + expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }) do + job.execute({ post_id: post.id }) + end + end + + it "does not run when requested to skip" do + job.execute({ post_id: post.id, skip_pull_hotlinked_images: true }) + expect(Jobs::PullHotlinkedImages.jobs.size).to eq(0) + end + end + + end end diff --git a/spec/jobs/pull_hotlinked_images_spec.rb b/spec/jobs/pull_hotlinked_images_spec.rb index 416d9a80ab4..c7be60aea6d 100644 --- a/spec/jobs/pull_hotlinked_images_spec.rb +++ b/spec/jobs/pull_hotlinked_images_spec.rb @@ -67,6 +67,21 @@ describe Jobs::PullHotlinkedImages do expect(post.reload.raw).to eq("") end + it 'enqueues raw replacement job with a delay' do + Jobs.run_later! + + post = Fabricate(:post, raw: "") + stub_image_size + + freeze_time + Jobs.expects(:cancel_scheduled_job).with(:update_hotlinked_raw, post_id: post.id).once + delay = SiteSetting.editing_grace_period + 1 + + expect_enqueued_with(job: :update_hotlinked_raw, args: { post_id: post.id }, at: Time.zone.now + delay.seconds) do + Jobs::PullHotlinkedImages.new.execute(post_id: post.id) + end + end + it 'removes downloaded images when they are no longer needed' do post = Fabricate(:post, raw: "") stub_image_size @@ -196,14 +211,14 @@ describe Jobs::PullHotlinkedImages do expect(post.uploads).to contain_exactly(upload) end - it "skips raw_html posts" do + it "skips editing raw for raw_html posts" do raw = "" post = Fabricate(:post, raw: raw, cook_method: Post.cook_methods[:raw_html]) stub_image_size expect do post.rebake! post.reload - end.not_to change { Upload.count } + end.to change { Upload.count }.by(1) expect(post.raw).to eq(raw) end @@ -556,6 +571,43 @@ describe Jobs::PullHotlinkedImages do end end + context "#disable_if_low_on_disk_space" do + fab!(:post) { Fabricate(:post, created_at: 20.days.ago) } + let(:job) { Jobs::PullHotlinkedImages.new } + + before do + SiteSetting.download_remote_images_to_local = true + SiteSetting.download_remote_images_threshold = 20 + job.stubs(:available_disk_space).returns(50) + end + + it "does nothing when there's enough disk space" do + SiteSetting.expects(:download_remote_images_to_local=).never + job.execute({ post_id: post.id }) + end + + context "when there's not enough disk space" do + + before { SiteSetting.download_remote_images_threshold = 75 } + + it "disables download_remote_images_threshold and send a notification to the admin" do + StaffActionLogger.any_instance.expects(:log_site_setting_change).once + SystemMessage.expects(:create_from_system_user).with(Discourse.site_contact_user, :download_remote_images_disabled).once + job.execute({ post_id: post.id }) + + expect(SiteSetting.download_remote_images_to_local).to eq(false) + end + + it "doesn't disable download_remote_images_to_local if site uses S3" do + setup_s3 + job.execute({ post_id: post.id }) + + expect(SiteSetting.download_remote_images_to_local).to eq(true) + end + + end + end + def stub_s3(upload) stub_upload(upload) stub_request(:get, "https:" + upload.url).to_return(status: 200, body: file_from_fixtures("smallest.png")) diff --git a/spec/lib/cooked_post_processor_spec.rb b/spec/lib/cooked_post_processor_spec.rb index 8f4a2b925b3..d626cc5f5c3 100644 --- a/spec/lib/cooked_post_processor_spec.rb +++ b/spec/lib/cooked_post_processor_spec.rb @@ -21,7 +21,6 @@ describe CookedPostProcessor do cpp.expects(:post_process_oneboxes).in_sequence(post_process) cpp.expects(:post_process_images).in_sequence(post_process) cpp.expects(:optimize_urls).in_sequence(post_process) - cpp.expects(:pull_hotlinked_images).in_sequence(post_process) cpp.post_process expect(PostUpload.exists?(post: post, upload: upload)).to eq(true) @@ -1543,102 +1542,6 @@ describe CookedPostProcessor do end end - context "#pull_hotlinked_images" do - - let(:post) { build(:post, created_at: 20.days.ago) } - let(:cpp) { CookedPostProcessor.new(post) } - - before { cpp.stubs(:available_disk_space).returns(90) } - - it "runs even when download_remote_images_to_local is disabled" do - # We want to run it to pull hotlinked optimized images - SiteSetting.download_remote_images_to_local = false - expect { cpp.pull_hotlinked_images }. - to change { Jobs::PullHotlinkedImages.jobs.count }.by 1 - end - - context "when download_remote_images_to_local? is enabled" do - before do - SiteSetting.download_remote_images_to_local = true - end - - it "disables download_remote_images if there is not enough disk space" do - cpp.expects(:available_disk_space).returns(5) - cpp.pull_hotlinked_images - expect(SiteSetting.download_remote_images_to_local).to eq(false) - end - - it "does not run when requested to skip" do - CookedPostProcessor.new(post, skip_pull_hotlinked_images: true).pull_hotlinked_images - expect(Jobs::PullHotlinkedImages.jobs.size).to eq(0) - end - - context "and there is enough disk space" do - before { cpp.expects(:disable_if_low_on_disk_space).at_least_once } - - context "and the post has been updated by an actual user" do - - before { post.id = 42 } - - it "ensures only one job is scheduled right after the editing_grace_period" do - freeze_time - - Jobs.expects(:cancel_scheduled_job).with(:pull_hotlinked_images, post_id: post.id).once - - delay = SiteSetting.editing_grace_period + 1 - - expect_enqueued_with(job: :pull_hotlinked_images, args: { post_id: post.id }, at: Time.zone.now + delay.seconds) do - cpp.pull_hotlinked_images - end - end - - end - - end - - end - - end - - context "#disable_if_low_on_disk_space" do - - let(:post) { build(:post, created_at: 20.days.ago) } - let(:cpp) { CookedPostProcessor.new(post) } - - before do - SiteSetting.download_remote_images_to_local = true - SiteSetting.download_remote_images_threshold = 20 - cpp.stubs(:available_disk_space).returns(50) - end - - it "does nothing when there's enough disk space" do - SiteSetting.expects(:download_remote_images_to_local=).never - cpp.disable_if_low_on_disk_space - end - - context "when there's not enough disk space" do - - before { SiteSetting.download_remote_images_threshold = 75 } - - it "disables download_remote_images_threshold and send a notification to the admin" do - StaffActionLogger.any_instance.expects(:log_site_setting_change).once - SystemMessage.expects(:create_from_system_user).with(Discourse.site_contact_user, :download_remote_images_disabled).once - cpp.disable_if_low_on_disk_space - - expect(SiteSetting.download_remote_images_to_local).to eq(false) - end - - it "doesn't disable download_remote_images_to_local if site uses S3" do - setup_s3 - cpp.disable_if_low_on_disk_space - - expect(SiteSetting.download_remote_images_to_local).to eq(true) - end - - end - - end - context "#is_a_hyperlink?" do let(:post) { build(:post) }