DEV: Allow site administrators to mark S3 uploads with a missing status (#27222)

This commit introduces the following changes, which allow a site
administrator to mark `Upload` records with the `s3_file_missing`
verification status. `Upload` records with this status are ignored when
`Discourse.store.list_missing_uploads` is run on a site where S3 uploads
are enabled and `SiteSetting.enable_s3_inventory` is set to `true`.

1. Introduce `s3_file_missing` to `Upload.verification_statuses`
2. Introduce `Upload.mark_invalid_s3_uploads_as_missing` which updates
   `Upload#verification_status` of all `Upload` records from `invalid_etag` to `s3_file_missing`.
3. Introduce `rake uploads:mark_invalid_s3_uploads_as_missing` Rake task
   which allows a site administrator to change `Upload` records with
   the `invalid_etag` verification status to the `s3_file_missing`
   verification status.
4. Update `S3Inventory` to ignore `Upload` records with the
   `s3_file_missing` verification status.
This commit is contained in:
Alan Guo Xiang Tan
2024-05-30 08:37:38 +08:00
committed by GitHub
parent 2d1ab4c9e3
commit dc55b645b2
5 changed files with 125 additions and 44 deletions

View File

@ -4,7 +4,7 @@ require "aws-sdk-s3"
require "csv"
class S3Inventory
attr_reader :type, :model, :inventory_date
attr_reader :type, :inventory_date
CSV_KEY_INDEX = 1
CSV_ETAG_INDEX = 2
@ -25,9 +25,12 @@ class S3Inventory
if type == :upload
@type = "original"
@model = Upload
@scope = @model.by_users.without_s3_file_missing_confirmed_verification_status
elsif type == :optimized
@type = "optimized"
@model = OptimizedImage
@scope = @model = OptimizedImage
else
raise "Invalid type: #{type}"
end
end
@ -46,10 +49,10 @@ class S3Inventory
ActiveRecord::Base.transaction do
begin
connection.exec(
"CREATE TEMP TABLE #{table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
"CREATE TEMP TABLE #{tmp_table_name}(url text UNIQUE, etag text, PRIMARY KEY(etag, url))",
)
connection.copy_data("COPY #{table_name} FROM STDIN CSV") do
connection.copy_data("COPY #{tmp_table_name} FROM STDIN CSV") do
for_each_inventory_row do |row|
key = row[CSV_KEY_INDEX]
@ -61,66 +64,70 @@ class S3Inventory
end
end
table_name = @model.table_name
# backfilling etags
connection.async_exec(
"UPDATE #{model.table_name}
SET etag = #{table_name}.etag
FROM #{table_name}
WHERE #{model.table_name}.etag IS NULL AND
#{model.table_name}.url = #{table_name}.url",
"UPDATE #{table_name}
SET etag = #{tmp_table_name}.etag
FROM #{tmp_table_name}
WHERE #{table_name}.etag IS NULL AND
#{table_name}.url = #{tmp_table_name}.url",
)
uploads = model.where("updated_at < ?", inventory_date)
uploads = uploads.by_users if model == Upload
uploads = @scope.where("updated_at < ?", inventory_date)
missing_uploads =
uploads.joins(
"LEFT JOIN #{table_name} ON #{table_name}.etag = #{model.table_name}.etag",
).where("#{table_name}.etag IS NULL")
"LEFT JOIN #{tmp_table_name} ON #{tmp_table_name}.etag = #{table_name}.etag",
).where("#{tmp_table_name}.etag IS NULL")
exists_with_different_etag =
missing_uploads
.joins(
"LEFT JOIN #{table_name} inventory2 ON inventory2.url = #{model.table_name}.url",
"LEFT JOIN #{tmp_table_name} inventory2 ON inventory2.url = #{table_name}.url",
)
.where("inventory2.etag IS NOT NULL")
.pluck(:id)
# marking as verified/not verified
if model == Upload
if @model == Upload
sql_params = {
inventory_date: inventory_date,
invalid_etag: Upload.verification_statuses[:invalid_etag],
s3_file_missing_confirmed: Upload.verification_statuses[:s3_file_missing_confirmed],
verified: Upload.verification_statuses[:verified],
seeded_id_threshold: model::SEEDED_ID_THRESHOLD,
seeded_id_threshold: @model::SEEDED_ID_THRESHOLD,
}
DB.exec(<<~SQL, sql_params)
UPDATE #{model.table_name}
UPDATE #{table_name}
SET verification_status = :verified
WHERE etag IS NOT NULL
AND verification_status <> :verified
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date
AND id > :seeded_id_threshold
AND EXISTS
(
SELECT 1
FROM #{table_name}
WHERE #{table_name}.etag = #{model.table_name}.etag
FROM #{tmp_table_name}
WHERE #{tmp_table_name}.etag = #{table_name}.etag
)
SQL
DB.exec(<<~SQL, sql_params)
UPDATE #{model.table_name}
UPDATE #{table_name}
SET verification_status = :invalid_etag
WHERE verification_status <> :invalid_etag
AND verification_status <> :s3_file_missing_confirmed
AND updated_at < :inventory_date
AND id > :seeded_id_threshold
AND NOT EXISTS
(
SELECT 1
FROM #{table_name}
WHERE #{table_name}.etag = #{model.table_name}.etag
FROM #{tmp_table_name}
WHERE #{tmp_table_name}.etag = #{table_name}.etag
)
SQL
end
@ -136,16 +143,16 @@ class S3Inventory
end
end
log "#{missing_count} of #{uploads.count} #{model.name.underscore.pluralize} are missing"
log "#{missing_count} of #{uploads.count} #{@scope.name.underscore.pluralize} are missing"
if exists_with_different_etag.present?
log "#{exists_with_different_etag.count} of these are caused by differing etags"
log "Null the etag column and re-run for automatic backfill"
end
end
Discourse.stats.set("missing_s3_#{model.table_name}", missing_count)
Discourse.stats.set("missing_s3_#{table_name}", missing_count)
ensure
connection.exec("DROP TABLE #{table_name}") unless connection.nil?
connection.exec("DROP TABLE #{tmp_table_name}") unless connection.nil?
end
end
ensure
@ -255,7 +262,7 @@ class S3Inventory
@connection ||= ActiveRecord::Base.connection.raw_connection
end
def table_name
def tmp_table_name
"#{type}_inventory"
end