Revert "FEATURE: Mark bad uploads with :invalid_url (#29640)" (#29657)

This reverts commit 5a00a041f1a9a00fb31b18956769262af6f11037.

Implementation is currently not correct. Multiple uploads can share the
same etag but have different paths in the S3 bucket.
This commit is contained in:
Alan Guo Xiang Tan
2024-11-08 13:04:52 +08:00
committed by GitHub
parent 4bc030f76f
commit af642d0d69
3 changed files with 7 additions and 67 deletions

View File

@ -66,25 +66,13 @@ class Upload < ActiveRecord::Base
scope :with_invalid_etag_verification_status, scope :with_invalid_etag_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_etag]) } -> { where(verification_status: Upload.verification_statuses[:invalid_etag]) }
scope :with_invalid_url_verification_status,
-> { where(verification_status: Upload.verification_statuses[:invalid_url]) }
def self.verification_statuses def self.verification_statuses
@verification_statuses ||= @verification_statuses ||=
Enum.new( Enum.new(
unchecked: 1, unchecked: 1,
verified: 2, verified: 2,
# Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to invalid_etag: 3, # Used by S3Inventory to mark S3 Upload records that have an invalid ETag value compared to the ETag value of the inventory file
# the ETag value of the inventory file. A upload with invalid ETag is equivalent to "missing s3_file_missing_confirmed: 4, # Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a file in the S3 file store
# upload file"
invalid_etag: 3,
# Used by S3Inventory to skip S3 Upload records that are confirmed to not be backed by a
# file in the S3 file store
s3_file_missing_confirmed: 4,
# Used by S3Inventory to mark S3 Upload records that have an invalid url value compared to
# the url value of the inventory file. A upload with invalid URL is equivalent to "file
# exists (same ETag), but with a different URL"
invalid_url: 5,
) )
end end

View File

@ -85,9 +85,7 @@ class S3Inventory
missing_uploads = missing_uploads =
uploads.joins( uploads.joins(
"LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag", "LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
).where( ).where("#{tmp_table_name}.etag IS NULL")
"#{tmp_table_name}.etag IS NULL OR #{tmp_table_name}.url != #{table_name}.url",
)
exists_with_different_etag = exists_with_different_etag =
missing_uploads missing_uploads
@ -97,20 +95,11 @@ class S3Inventory
.where("inventory2.etag IS NOT NULL") .where("inventory2.etag IS NOT NULL")
.pluck(:id) .pluck(:id)
exists_with_different_url =
missing_uploads
.joins(
"LEFT JOIN #{tmp_table_name} inventory3 ON inventory3.etag = #{table_name}.etag",
)
.where("inventory3.url != #{table_name}.url")
.pluck(:id)
# marking as verified/not verified # marking as verified/not verified
if @model == Upload if @model == Upload
sql_params = { sql_params = {
inventory_date: inventory_date, inventory_date: inventory_date,
invalid_etag: Upload.verification_statuses[:invalid_etag], invalid_etag: Upload.verification_statuses[:invalid_etag],
invalid_url: Upload.verification_statuses[:invalid_url],
s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed], s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
verified: Upload.verification_statuses[:verified], verified: Upload.verification_statuses[:verified],
seeded_id_threshold: @model::SEEDED_ID_THRESHOLD, seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
@ -146,22 +135,6 @@ class S3Inventory
WHERE #{tmp_table_name}.etag = #{table_name}.etag WHERE #{tmp_table_name}.etag = #{table_name}.etag
) )
SQL SQL
DB.exec(<<~SQL, sql_params)
UPDATE #{table_name}
SET verification_status = :invalid_url
WHERE verification_status <> :invalid_url
AND verification_status <> :invalid_etag
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date
AND id > :seeded_id_threshold
AND NOT EXISTS
(
SELECT 1
FROM #{tmp_table_name}
WHERE #{tmp_table_name}.url = #{table_name}.url
)
SQL
end end
if (missing_count = missing_uploads.count) > 0 if (missing_count = missing_uploads.count) > 0
@ -170,8 +143,6 @@ class S3Inventory
.find_each do |upload| .find_each do |upload|
if exists_with_different_etag.include?(upload.id) if exists_with_different_etag.include?(upload.id)
log "#{upload.url} has different etag" log "#{upload.url} has different etag"
elsif exists_with_different_url.include?(upload.id)
log "#{upload.url} has different url"
else else
log upload.url log upload.url
end end
@ -182,10 +153,6 @@ class S3Inventory
log "#{exists_with_different_etag.count} of these are caused by differing etags" log "#{exists_with_different_etag.count} of these are caused by differing etags"
log "Null the etag column and re-run for automatic backfill" log "Null the etag column and re-run for automatic backfill"
end end
if exists_with_different_url.present?
log "#{exists_with_different_url.count} of these are caused by differing urls"
log "Empty the url column and re-run for automatic backfill"
end
end end
set_missing_s3_discourse_stats(missing_count) set_missing_s3_discourse_stats(missing_count)

View File

@ -74,29 +74,20 @@ RSpec.describe S3Inventory do
differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
differing_etag.update_columns(etag: "somethingelse") differing_etag.update_columns(etag: "somethingelse")
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
output = capture_stdout { inventory.backfill_etags_and_list_missing } output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq(<<~TEXT) expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag #{differing_etag.url} has different etag
#{differing_url.url} has different url
#{@upload_1.url} #{@upload_1.url}
#{@no_etag.url} #{@no_etag.url}
4 of 5 uploads are missing 3 of 5 uploads are missing
1 of these are caused by differing etags 1 of these are caused by differing etags
Null the etag column and re-run for automatic backfill Null the etag column and re-run for automatic backfill
1 of these are caused by differing urls
Empty the url column and re-run for automatic backfill
TEXT TEXT
expect(Discourse.stats.get("missing_s3_uploads")).to eq(4) expect(Discourse.stats.get("missing_s3_uploads")).to eq(3)
end end
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
differing_url = Upload.find_by(etag: "0cdc623af39cde0adb382670a6dc702a")
differing_url.update_columns(url: differing_url.url.gsub("default", "notdefault"))
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(12) ).to eq(12)
@ -105,10 +96,9 @@ RSpec.describe S3Inventory do
verification_status = Upload.pluck(:verification_status) verification_status = Upload.pluck(:verification_status)
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:verified]).count, Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
).to eq(2) ).to eq(3)
expect(Upload.with_invalid_etag_verification_status.count).to eq(2) expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
expect(Upload.with_invalid_url_verification_status.count).to eq(1)
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
@ -208,12 +198,7 @@ RSpec.describe S3Inventory do
CSV.foreach(csv_filename, headers: false) do |row| CSV.foreach(csv_filename, headers: false) do |row|
next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default") next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default")
Fabricate( Fabricate(:upload, etag: row[S3Inventory::CSV_ETAG_INDEX], updated_at: 2.days.ago)
:upload,
url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]),
etag: row[S3Inventory::CSV_ETAG_INDEX],
updated_at: 2.days.ago,
)
end end
upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago) upload = Fabricate(:upload, etag: "ETag", updated_at: 1.days.ago)