From f6e87e1e5ebdd6b6cfafa9e23cdd0a29190d1a7c Mon Sep 17 00:00:00 2001
From: Justin DiRose
Date: Fri, 8 Jan 2021 09:31:39 -0600
Subject: [PATCH] DEV: Improvements to Discourse Merger script (#11660)

After running the Discourse merge script, it was pretty evident it held up
well after all these years ;)

Made a few fixes:

- Included an environment variable for DB_PASS as likely the password will
  need to be changed if running the import in an official Docker container
  (recommended)
- Set a hard order for imported categories, otherwise sometimes they'd be
  imported in a weird order making things unpredictable for parent/child
  category imports
- Fixed a couple of instances where we added unique indexes (such as on
  category slugs)
- Set up upload regex to handle AWS URLs better
- Fixed the script to work with frozen string literals
---
 script/bulk_import/discourse_merger.rb | 50 ++++++++++++++++++--------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/script/bulk_import/discourse_merger.rb b/script/bulk_import/discourse_merger.rb
index 6bc189a6648..da2a9c6a114 100644
--- a/script/bulk_import/discourse_merger.rb
+++ b/script/bulk_import/discourse_merger.rb
@@ -9,6 +9,7 @@ class BulkImport::DiscourseMerger < BulkImport::Base
 
   # DB_NAME: name of database being merged into the current local db
   # DB_HOST: hostname of database being merged
+  # DB_PASS: password used to access the Discourse database by the postgres user
   # UPLOADS_PATH: absolute path of the directory containing "original"
   #               and "optimized" dirs. e.g. /home/discourse/other-site/public/uploads/default
   # SOURCE_BASE_URL: base url of the site being merged. e.g. https://meta.discourse.org
@@ -16,12 +17,15 @@ class BulkImport::DiscourseMerger < BulkImport::Base
   #           e.g. https://discourse-cdn-sjc1.com/business4
 
   def initialize
+    db_password = ENV["DB_PASS"] || 'import_password'
     local_db = ActiveRecord::Base.connection_config
-    @raw_connection = PG.connect(dbname: local_db[:database], host: 'localhost', port: local_db[:port])
+    @raw_connection = PG.connect(dbname: local_db[:database], host: 'localhost', port: local_db[:port], user: 'postgres', password: db_password)
 
     @source_db_config = {
       dbname: ENV["DB_NAME"] || 'dd_demo',
-      host: ENV["DB_HOST"] || 'localhost'
+      host: ENV["DB_HOST"] || 'localhost',
+      user: 'postgres',
+      password: db_password
     }
 
     raise "SOURCE_BASE_URL missing!" unless ENV['SOURCE_BASE_URL']
@@ -109,7 +113,7 @@ class BulkImport::DiscourseMerger < BulkImport::Base
       old_user_id = row['id']&.to_i
       if existing = UserEmail.where(email: row.delete('email')).first&.user
         # Merge these users
-        @users[old_user_id.to_s] = existing.id
+        @users[old_user_id] = existing.id
         @merged_user_ids << old_user_id
         next
       else
@@ -122,7 +126,7 @@ class BulkImport::DiscourseMerger < BulkImport::Base
       end
 
       row['id'] = (@last_user_id += 1)
-      @users[old_user_id.to_s] = row['id']
+      @users[old_user_id] = row['id']
 
       @raw_connection.put_copy_data row.values
     end
@@ -189,17 +193,30 @@ class BulkImport::DiscourseMerger < BulkImport::Base
             c1.slug AS slug
           FROM categories c1
           LEFT OUTER JOIN categories c2 ON c1.parent_category_id = c2.id
-        ) x ON c.id = x.id"
+        ) x ON c.id = x.id
+        ORDER BY c.id"
     ).each do |row|
+      # using ORDER BY id to import categories in order of creation.
+      # this assumes parent categories were created prior to child categories
+      # and have a lower category id.
+      #
+      # without this definition, categories import in different orders in subsequent imports
+      # and can potentially mess up parent/child structure
+
       source_category_path = row.delete('path')&.squeeze('/')
 
       existing = Category.where(slug: row['slug']).first
      parent_slug = existing&.parent_category&.slug
 
       if existing && source_category_path == "/c/#{parent_slug}/#{existing.slug}".squeeze('/')
-        @categories[row['id']] = existing.id
+        @categories[row['id'].to_i] = existing.id
         next
+      elsif existing
+        # if not the exact path as the source,
+        # we still need to avoid a unique index conflict on the slug when importing
+        # if that's the case, we'll append the imported id
+        row['slug'] = "#{row['slug']}-#{row['id']}"
       end
 
       old_user_id = row['user_id'].to_i
@@ -211,10 +228,10 @@ class BulkImport::DiscourseMerger < BulkImport::Base
         row['parent_category_id'] = category_id_from_imported_id(row['parent_category_id'])
       end
 
-      old_id = row['id']
+      old_id = row['id'].to_i
       row['id'] = (last_id += 1)
       imported_ids << old_id
-      @categories[old_id.to_s] = row['id']
+      @categories[old_id] = row['id']
 
       @raw_connection.put_copy_data(row.values)
     end
@@ -234,8 +251,8 @@ class BulkImport::DiscourseMerger < BulkImport::Base
     puts 'updating category description topic ids...'
 
     @categories.each do |old_id, new_id|
-      category = Category.find(new_id)
-      if description_topic_id = topic_id_from_imported_id(category.topic_id)
+      category = Category.find(new_id) if new_id.present?
+      if description_topic_id = topic_id_from_imported_id(category&.topic_id)
         category.topic_id = description_topic_id
         category.save!
       end
@@ -321,8 +338,13 @@ class BulkImport::DiscourseMerger < BulkImport::Base
 
       next if Upload.where(sha1: row['sha1']).exists?
 
+      # make sure to get a backup with uploads then convert them to local.
+      # when the backup is restored to a site with s3 uploads, it will upload the items
+      # to the bucket
       rel_filename = row['url'].gsub(/^\/uploads\/[^\/]+\//, '')
-      rel_filename = rel_filename.gsub(/^\/\/[^\/]+\.amazonaws\.com\//, '')
+      # assumes if coming from amazonaws.com that we want to remove everything
+      # but the text after the last `/`, which should leave us the filename
+      rel_filename = rel_filename.gsub(/^\/\/[^\/]+\.amazonaws\.com\/\S+\//, '')
       absolute_filename = File.join(@uploads_path, rel_filename)
 
       old_id = row['id']
@@ -457,11 +479,11 @@ class BulkImport::DiscourseMerger < BulkImport::Base
       row['deleted_by_id'] = user_id_from_imported_id(row['deleted_by_id']) if row['deleted_by_id']
       row['badge_id'] = badge_id_from_imported_id(row['badge_id']) if row['badge_id']
 
-      old_id = row['id']
+      old_id = row['id'].to_i
       if old_id && last_id
         row['id'] = (last_id += 1)
         imported_ids << old_id if has_custom_fields
-        mapping[old_id.to_s] = row['id'] if mapping
+        mapping[old_id] = row['id'] if mapping
       end
 
       if skip_processing
@@ -724,7 +746,7 @@ class BulkImport::DiscourseMerger < BulkImport::Base
     User.where('id >= ?', @first_new_user_id).find_each do |u|
       arr = []
-      sql = "UPDATE users SET"
+      sql = "UPDATE users SET".dup
 
       if new_approved_by_id = user_id_from_imported_id(u.approved_by_id)
         arr << " approved_by_id = #{new_approved_by_id}"
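
For reference, a minimal Ruby sketch (not part of the patch) of what the tightened
amazonaws.com regex in the uploads hunk (@@ -321,8 +338,13 @@) changes. The bucket
name and file name below are made-up examples of a protocol-relative S3 URL:

url = "//my-bucket.s3.amazonaws.com/original/1X/0123456789abcdef.png"

# Old pattern: only the "//host/" prefix is stripped, so the S3 key prefix remains.
puts url.gsub(/^\/\/[^\/]+\.amazonaws\.com\//, '')
# => "original/1X/0123456789abcdef.png"

# New pattern: the greedy \S+ also consumes everything up to the last "/",
# leaving just the file name, which the script then joins onto UPLOADS_PATH.
puts url.gsub(/^\/\/[^\/]+\.amazonaws\.com\/\S+\//, '')
# => "0123456789abcdef.png"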
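
Likewise, a small sketch of the frozen-string-literal fix in the last hunk. It assumes
Ruby 2.5+ (for FrozenError) and a file carrying the # frozen_string_literal: true magic
comment, which the fifth fix implies discourse_merger.rb uses; the sql string is
presumably appended to further down, outside the lines shown in the hunk:

# frozen_string_literal: true

# With the magic comment above, string literals are frozen, so appending in place raises.
sql = "UPDATE users SET"
begin
  sql << " approved_by_id = 1"
rescue FrozenError => e
  puts "append failed: #{e.message}"
end

# Duplicating the literal first, as the patch now does, gives a mutable copy.
sql = "UPDATE users SET".dup
sql << " approved_by_id = 1"
puts sql   # => "UPDATE users SET approved_by_id = 1"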