FIX: S3Inventory#backfill_etags_and_list_missing need to unescape key (#30787)

The `key` provided in the S3 inventory file will esacpe any special
characters in the filename of the key so we need to unescape. Otherwise,
uploads with extensions that conatins special characters will fail to
match records which we insert into the temporary table based off the
s3 inventory file.
This commit is contained in:
Alan Guo Xiang Tan
2025-01-15 14:52:49 +08:00
committed by GitHub
parent 061899fee4
commit 1a70d118a8
4 changed files with 33 additions and 15 deletions

View File

@ -65,7 +65,7 @@ class S3Inventory
next if key.exclude?("#{type}/") next if key.exclude?("#{type}/")
url = File.join(Discourse.store.absolute_base_url, key) url = File.join(Discourse.store.absolute_base_url, key)
connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n") connection.put_copy_data("#{CGI.unescape(url)},#{row[CSV_ETAG_INDEX]}\n")
end end
end end

View File

@ -1,4 +1,5 @@
"abc","uploads/default/original/1X/0184537a4f419224404d013414e913a4f56018f2.jpg","defcaac0b4aca535c284e95f30d608d0" "abc","uploads/default/original/1X/0184537a4f419224404d013414e913a4f56018f2.jpg","defcaac0b4aca535c284e95f30d608d0"
"abc","uploads/default/original/1X/050afc0ab01debe8cf48fd2ce50fbbf5eb072815.jpg","0cdc623af39cde0adb382670a6dc702a" "abc","uploads/default/original/1X/050afc0ab01debe8cf48fd2ce50fbbf5eb072815.jpg","0cdc623af39cde0adb382670a6dc702a"
"abc","uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png","25c02eaceef4cb779fc17030d33f7f06" "abc","uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png","25c02eaceef4cb779fc17030d33f7f06"
"abc","uploads/default/original/1X/583bd7617044b269ae87a98684365d794e17b493.png%28test","25c02eaceef4cb779fc17030d33f7f51"
"abc","uploads/second/original/1X/f789fbf5490babc68326b9cec90eeb0d6590db03.png","15c02eaceef4cb779fc17030d33f7f04" "abc","uploads/second/original/1X/f789fbf5490babc68326b9cec90eeb0d6590db03.png","15c02eaceef4cb779fc17030d33f7f04"

1 abc uploads/default/original/1X/0184537a4f419224404d013414e913a4f56018f2.jpg defcaac0b4aca535c284e95f30d608d0
2 abc uploads/default/original/1X/050afc0ab01debe8cf48fd2ce50fbbf5eb072815.jpg 0cdc623af39cde0adb382670a6dc702a
3 abc uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png 25c02eaceef4cb779fc17030d33f7f06
4 abc uploads/default/original/1X/583bd7617044b269ae87a98684365d794e17b493.png%28test 25c02eaceef4cb779fc17030d33f7f51
5 abc uploads/second/original/1X/f789fbf5490babc68326b9cec90eeb0d6590db03.png 15c02eaceef4cb779fc17030d33f7f04

View File

@ -22,7 +22,7 @@ RSpec.describe "S3Inventory", type: :multisite do
db1 = files["default"].read db1 = files["default"].read
db2 = files["second"].read db2 = files["second"].read
expect(db1.lines.count).to eq(3) expect(db1.lines.count).to eq(4)
expect(db2.lines.count).to eq(1) expect(db2.lines.count).to eq(1)
files.values.each do |f| files.values.each do |f|

View File

@ -39,10 +39,15 @@ RSpec.describe S3Inventory do
CSV.foreach(csv_filename, headers: false) do |row| CSV.foreach(csv_filename, headers: false) do |row|
next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default") next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default")
Fabricate( Fabricate(
:upload, :upload,
etag: row[S3Inventory::CSV_ETAG_INDEX], etag: row[S3Inventory::CSV_ETAG_INDEX],
url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]), url:
File.join(
Discourse.store.absolute_base_url,
CGI.unescape(row[S3Inventory::CSV_KEY_INDEX]),
),
updated_at: 2.days.ago, updated_at: 2.days.ago,
) )
end end
@ -66,37 +71,44 @@ RSpec.describe S3Inventory do
it "should display missing uploads correctly" do it "should display missing uploads correctly" do
output = capture_stdout { inventory.backfill_etags_and_list_missing } output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n") expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 6 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end end
it "should detect when a url match exists with a different etag" do it "should detect when a url match exists with a different etag" do
differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") upload_with_differing_tag_1 = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0")
differing_etag.update_columns(etag: "somethingelse") upload_with_differing_tag_1.update_columns(etag: "somethingelse")
upload_with_differing_tag_2 = Upload.find_by(etag: "25c02eaceef4cb779fc17030d33f7f51")
upload_with_differing_tag_2.update_columns(etag: "somethingelse")
output = capture_stdout { inventory.backfill_etags_and_list_missing } output = capture_stdout { inventory.backfill_etags_and_list_missing }
expect(output).to eq(<<~TEXT) expect(output).to eq(<<~TEXT)
#{differing_etag.url} has different etag #{upload_with_differing_tag_1.url} has different etag
#{upload_with_differing_tag_2.url} has different etag
#{@upload_1.url} #{@upload_1.url}
#{@no_etag.url} #{@no_etag.url}
3 of 5 uploads are missing 4 of 6 uploads are missing
1 of these are caused by differing etags 2 of these are caused by differing etags
Null the etag column and re-run for automatic backfill Null the etag column and re-run for automatic backfill
TEXT TEXT
expect(Discourse.stats.get("missing_s3_uploads")).to eq(3)
expect(Discourse.stats.get("missing_s3_uploads")).to eq(4)
end end
it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count,
).to eq(12) ).to eq(13)
output = capture_stdout { inventory.backfill_etags_and_list_missing } output = capture_stdout { inventory.backfill_etags_and_list_missing }
verification_status = Upload.pluck(:verification_status) verification_status = Upload.pluck(:verification_status)
expect( expect(
Upload.where(verification_status: Upload.verification_statuses[:verified]).count, Upload.where(verification_status: Upload.verification_statuses[:verified]).count,
).to eq(3) ).to eq(4)
expect(Upload.with_invalid_etag_verification_status.count).to eq(2) expect(Upload.with_invalid_etag_verification_status.count).to eq(2)
@ -128,8 +140,13 @@ RSpec.describe S3Inventory do
"#{Discourse.store.absolute_base_url}/uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png", "#{Discourse.store.absolute_base_url}/uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png",
"25c02eaceef4cb779fc17030d33f7f06", "25c02eaceef4cb779fc17030d33f7f06",
], ],
[
"#{Discourse.store.absolute_base_url}/uploads/default/original/1X/583bd7617044b269ae87a98684365d794e17b493.png(test",
"25c02eaceef4cb779fc17030d33f7f51",
],
] ]
files.each { |file| Fabricate(:upload, url: file[0]) }
files.each { |file| Fabricate(:upload, url: file[0], etag: nil) }
inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3) inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3)
@ -137,7 +154,7 @@ RSpec.describe S3Inventory do
capture_stdout do capture_stdout do
expect { inventory.backfill_etags_and_list_missing }.to change { expect { inventory.backfill_etags_and_list_missing }.to change {
Upload.where(etag: nil).count Upload.where(etag: nil).count
}.by(-2) }.by(-3)
end end
expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files) expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files)
@ -220,7 +237,7 @@ RSpec.describe S3Inventory do
end end
end end
expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 5 uploads are missing\n") expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 6 uploads are missing\n")
expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) expect(Discourse.stats.get("missing_s3_uploads")).to eq(2)
end end
end end