From 1a70d118a806567ecf6e99f0abce3a24fb11155c Mon Sep 17 00:00:00 2001 From: Alan Guo Xiang Tan Date: Wed, 15 Jan 2025 14:52:49 +0800 Subject: [PATCH] FIX: `S3Inventory#backfill_etags_and_list_missing` need to unescape key (#30787) The `key` provided in the S3 inventory file will esacpe any special characters in the filename of the key so we need to unescape. Otherwise, uploads with extensions that conatins special characters will fail to match records which we insert into the temporary table based off the s3 inventory file. --- lib/s3_inventory.rb | 2 +- spec/fixtures/csv/s3_inventory.csv | 1 + spec/lib/s3_inventory_multisite_spec.rb | 2 +- spec/lib/s3_inventory_spec.rb | 43 +++++++++++++++++-------- 4 files changed, 33 insertions(+), 15 deletions(-) diff --git a/lib/s3_inventory.rb b/lib/s3_inventory.rb index 7057a978a12..eebf2e00090 100644 --- a/lib/s3_inventory.rb +++ b/lib/s3_inventory.rb @@ -65,7 +65,7 @@ class S3Inventory next if key.exclude?("#{type}/") url = File.join(Discourse.store.absolute_base_url, key) - connection.put_copy_data("#{url},#{row[CSV_ETAG_INDEX]}\n") + connection.put_copy_data("#{CGI.unescape(url)},#{row[CSV_ETAG_INDEX]}\n") end end diff --git a/spec/fixtures/csv/s3_inventory.csv b/spec/fixtures/csv/s3_inventory.csv index eb461b3ecc7..572d752e402 100644 --- a/spec/fixtures/csv/s3_inventory.csv +++ b/spec/fixtures/csv/s3_inventory.csv @@ -1,4 +1,5 @@ "abc","uploads/default/original/1X/0184537a4f419224404d013414e913a4f56018f2.jpg","defcaac0b4aca535c284e95f30d608d0" "abc","uploads/default/original/1X/050afc0ab01debe8cf48fd2ce50fbbf5eb072815.jpg","0cdc623af39cde0adb382670a6dc702a" "abc","uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png","25c02eaceef4cb779fc17030d33f7f06" +"abc","uploads/default/original/1X/583bd7617044b269ae87a98684365d794e17b493.png%28test","25c02eaceef4cb779fc17030d33f7f51" "abc","uploads/second/original/1X/f789fbf5490babc68326b9cec90eeb0d6590db03.png","15c02eaceef4cb779fc17030d33f7f04" diff --git a/spec/lib/s3_inventory_multisite_spec.rb b/spec/lib/s3_inventory_multisite_spec.rb index 9fab4407274..6d0898000d6 100644 --- a/spec/lib/s3_inventory_multisite_spec.rb +++ b/spec/lib/s3_inventory_multisite_spec.rb @@ -22,7 +22,7 @@ RSpec.describe "S3Inventory", type: :multisite do db1 = files["default"].read db2 = files["second"].read - expect(db1.lines.count).to eq(3) + expect(db1.lines.count).to eq(4) expect(db2.lines.count).to eq(1) files.values.each do |f| diff --git a/spec/lib/s3_inventory_spec.rb b/spec/lib/s3_inventory_spec.rb index 44afba50932..db2fd449791 100644 --- a/spec/lib/s3_inventory_spec.rb +++ b/spec/lib/s3_inventory_spec.rb @@ -39,10 +39,15 @@ RSpec.describe S3Inventory do CSV.foreach(csv_filename, headers: false) do |row| next if row[S3Inventory::CSV_KEY_INDEX].exclude?("default") + Fabricate( :upload, etag: row[S3Inventory::CSV_ETAG_INDEX], - url: File.join(Discourse.store.absolute_base_url, row[S3Inventory::CSV_KEY_INDEX]), + url: + File.join( + Discourse.store.absolute_base_url, + CGI.unescape(row[S3Inventory::CSV_KEY_INDEX]), + ), updated_at: 2.days.ago, ) end @@ -66,37 +71,44 @@ RSpec.describe S3Inventory do it "should display missing uploads correctly" do output = capture_stdout { inventory.backfill_etags_and_list_missing } - expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 5 uploads are missing\n") + expect(output).to eq("#{@upload_1.url}\n#{@no_etag.url}\n2 of 6 uploads are missing\n") expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) end it "should detect when a url match exists with a different etag" do - differing_etag = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") - differing_etag.update_columns(etag: "somethingelse") + upload_with_differing_tag_1 = Upload.find_by(etag: "defcaac0b4aca535c284e95f30d608d0") + upload_with_differing_tag_1.update_columns(etag: "somethingelse") + + upload_with_differing_tag_2 = Upload.find_by(etag: "25c02eaceef4cb779fc17030d33f7f51") + upload_with_differing_tag_2.update_columns(etag: "somethingelse") output = capture_stdout { inventory.backfill_etags_and_list_missing } expect(output).to eq(<<~TEXT) - #{differing_etag.url} has different etag + #{upload_with_differing_tag_1.url} has different etag + #{upload_with_differing_tag_2.url} has different etag #{@upload_1.url} #{@no_etag.url} - 3 of 5 uploads are missing - 1 of these are caused by differing etags + 4 of 6 uploads are missing + 2 of these are caused by differing etags Null the etag column and re-run for automatic backfill TEXT - expect(Discourse.stats.get("missing_s3_uploads")).to eq(3) + + expect(Discourse.stats.get("missing_s3_uploads")).to eq(4) end it "marks missing uploads as not verified and found uploads as verified. uploads not checked will be verified nil" do expect( Upload.where(verification_status: Upload.verification_statuses[:unchecked]).count, - ).to eq(12) + ).to eq(13) + output = capture_stdout { inventory.backfill_etags_and_list_missing } verification_status = Upload.pluck(:verification_status) + expect( Upload.where(verification_status: Upload.verification_statuses[:verified]).count, - ).to eq(3) + ).to eq(4) expect(Upload.with_invalid_etag_verification_status.count).to eq(2) @@ -128,8 +140,13 @@ RSpec.describe S3Inventory do "#{Discourse.store.absolute_base_url}/uploads/default/original/1X/0789fbf5490babc68326b9cec90eeb0d6590db05.png", "25c02eaceef4cb779fc17030d33f7f06", ], + [ + "#{Discourse.store.absolute_base_url}/uploads/default/original/1X/583bd7617044b269ae87a98684365d794e17b493.png(test", + "25c02eaceef4cb779fc17030d33f7f51", + ], ] - files.each { |file| Fabricate(:upload, url: file[0]) } + + files.each { |file| Fabricate(:upload, url: file[0], etag: nil) } inventory.expects(:files).returns([{ key: "Key", filename: "#{csv_filename}.gz" }]).times(3) @@ -137,7 +154,7 @@ RSpec.describe S3Inventory do capture_stdout do expect { inventory.backfill_etags_and_list_missing }.to change { Upload.where(etag: nil).count - }.by(-2) + }.by(-3) end expect(Upload.by_users.order(:url).pluck(:url, :etag)).to eq(files) @@ -220,7 +237,7 @@ RSpec.describe S3Inventory do end end - expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 5 uploads are missing\n") + expect(output).to eq("#{upload.url}\n#{no_etag.url}\n2 of 6 uploads are missing\n") expect(Discourse.stats.get("missing_s3_uploads")).to eq(2) end end