From af642d0d6985a39d1659f8c28dd8d7f605aedf1d Mon Sep 17 00:00:00 2001 From: Alan Guo Xiang Tan Date: Fri, 8 Nov 2024 13:04:52 +0800 Subject: [PATCH] Revert "FEATURE: Mark bad uploads with :invalid_url (#29640)" (#29657) This reverts commit 5a00a041f1a9a00fb31b18956769262af6f11037. Implementation is currently not correct. Multiple uploads can share the same etag but have different paths in the S3 bucket. --- app/models/upload.rb | 16 ++-------------- lib/s3_inventory.rb | 35 +---------------------------------- spec/lib/s3_inventory_spec.rb | 23 ++++------------------- 3 files changed, 7 insertions(+), 67 deletions(-) diff --git a/app/models/upload.rb b/app/models/upload.rb index 5c1ffd029fe..02df52ab18b 100644 --- a/app/models/upload.rb +++ b/app/models/upload.rb @@ -66,25 +66,13 @@ class Upload < ActiveRecord::Base scope :with_invalid_etag_verification_status, -> { where(verification_status: Upload.verification_statuses[:invalid_etag]) } - scope :with_invalid_url_verification_status, - -> { where(verification_status: Upload.verification_statuses[:invalid_url]) } - def self.verification_statuses @verification_statuses ||= Enum.new( unchecked: 1, verified: 2, - # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to - # the ETag value of the inventory file. A upload with invalid ETag is equivalent to "missing - # upload file" - invalid_etag: 3, - # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a - # file in the S3 file store - s3_file_missing_confirmed: 4, - # Used by S3Inventory to mark S3 Upload records that have an invalid url value compared to - # the url value of the inventory file. A upload with invalid URL is equivalent to "file - # exists (same ETag), but with a different URL" - invalid_url: 5, + invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file + s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store ) end diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb index 0e1662a643e..7057a978a12 100644 --- a/lib/s3_inventory.rb +++ b/lib/s3_inventory.rb @@ -85,9 +85,7 @@ class S3Inventory missing_uploads = uploads.joins( "LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag", - ).where( - "#{tmp_table_name}.etag IS NULL OR #{tmp_table_name}.url != #{table_name}.url", - ) + ).where("#{tmp_table_name}.etag IS NULL") exists_with_different_etag = missing_uploads @@ -97,20 +95,11 @@ class S3Inventory .where("inventory2.etag IS NOT NULL") .pluck(:id) - exists_with_different_url = - missing_uploads - .joins( - "LEFT JOIN #{tmp_table_name} inventory3 ON inventory3.etag = #{table_name}.etag", - ) - .where("inventory3.url != #{table_name}.url") - .pluck(:id) - # marking as verified/not verified if @model == Upload sql_params = { inventory_date: inventory_date, invalid_etag: Upload.verification_statuses[:invalid_etag], - invalid_url: Upload.verification_statuses[:invalid_url], s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed], verified: Upload.verification_statuses[:verified], seeded_id_threshold: @model::SEEDED_ID_THRESHOLD, @@ -146,22 +135,6 @@ class S3Inventory WHERE #{tmp_table_name}.etag = #{table_name}.etag ) SQL - - DB.exec(<<~SQL, sql_params) - UPDATE #{table_name} - SET verification_status = :invalid_url - WHERE verification_status <> :invalid_url - AND verification_status <> :invalid_etag - AND verification_status <> :s3_file_missing_confirmed - AND updated_at < :inventory_date - AND id > :seeded_id_threshold - AND NOT EXISTS - ( - SELECT 1 - FROM #{tmp_table_name} - WHERE #{tmp_table_name}.url = #{table_name}.url - ) - SQL end if (missing_count = missing_uploads.count) > 0 @@ -170,8 +143,6 @@ class S3Inventory .find_each do |upload| if exists_with_different_etag.include?(upload.id) log "#{upload.url} has different etag" - elsif exists_with_different_url.include?(upload.id) - log "#{upload.url} has different url" else log upload.url end @@ -182,10 +153,6 @@ class S3Inventory log "#{exists_with_different_etag.count} of these are caused by differing etags" log "Null the etag column and re-run for automatic backfill" end - if exists_with_different_url.present? - log "#{exists_with_different_url.count} of these are caused by differing urls" - log "Empty the url column and re-run for automatic backfill" - end end set_missing_s3_discourse_stats(missing_count) diff --git a/spec/lib/s3_inventory_spec.rb b/spec/lib/s3_inventory_spec.rb index 8ea55c28057..44afba50932 100644 --- a/spec/lib/s3_inventory_spec.rb +++ b/spec/lib/s3_inventory_spec.rb @@ -74,29 +74,20 @@ RSpec.describe S3Inventory do differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") differing_etag.update_columns(etag: "somethingelse") - differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a") - differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault")) - output = capture_stdout { inventory.backfill_etags_and_list_missing } expect(output).to eq(<<~TEXT) #{differing_etag.url} has different etag - #{differing_url.url} has different url #{@upload_1.url} #{@no_etag.url} - 4 of 5 uploads are missing + 3 of 5 uploads are missing 1 of these are caused by differing etags Null the etag column and re-run for automatic backfill - 1 of these are caused by differing urls - Empty the url column and re-run for automatic backfill TEXT - expect(Discourse.stats.get("missing_s3_uploads")).to eq(4) + expect(Discourse.stats.get("missing_s3_uploads")).to eq(3) end it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do - differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a") - differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault")) - expect( Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, ).to eq(12) @@ -105,10 +96,9 @@ RSpec.describe S3Inventory do verification_status = Upload.pluck(:verification_status) expect( Upload.where(verification_status: Upload.verification_statuses[:verified]).count, - ).to eq(2) + ).to eq(3) expect(Upload.with_invalid_etag_verification_status.count).to eq(2) - expect(Upload.with_invalid_url_verification_status.count).to eq(1) expect( Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, @@ -208,12 +198,7 @@ RSpec.describe S3Inventory do CSV.foreach(csv_filename, headers: false) do |row| next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default") - Fabricate( - :upload, - url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]), - etag: row[S3Inventory::CSV_ETAG_INDEX], - updated_at: 2.days.ago, - ) + Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago) end upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)