Fixes for vBulletin bulk importer (#17618)
* Allow taking the table prefix from an env var
* FIX: remove unused column references. The columns filedata and extension are not present in a v4.2.4 database, and they aren't used in the method anyway.
* FIX: report progress for tables without imported_id
* FIX: effectively check for AR validation errors. NOTE: other migration scripts have the same problem; see /t/58202.
* FIX: properly count Posts when importing attachments
* FIX: improve logging
* Remove leftover comment
* FIX: show progress when exporting the Permalink file
* PERF: stream the Permalink file. The previous approach built the whole file in memory; writing once per line keeps memory flat (see the sketch after this list).
* Document fixes needed
* WIP: deduplicate category names
* Ignore non-alphanumeric chars when grouping category names
* FIX: properly deduplicate user emails by merging accounts
* FIX: don't merge empty UserEmails
* Improve logging
* Merge users AFTER fixing primary key sequences
* Parallelize user merging
* Save the duplicated-users structure for debugging purposes
* Add progress logging for the (multiple-hour) user merging step
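The PERF item on streaming the Permalink file is the classic buffered-write to streamed-write fix. A minimal sketch of the idea, assuming a Permalink-like model with url and topic_id columns (the file name and CSV layout are illustrative, not the importer's actual output):

    # Write each line as soon as it is produced instead of building the
    # whole file in memory; find_each fetches records in batches.
    File.open("permalinks.csv", "w") do |file|
      Permalink.find_each do |permalink|
        file.puts "#{permalink.url},#{permalink.topic_id}"
      end
    end

Memory use stays flat no matter how many permalinks are exported, because only one record is held at a time.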
Commit bfecbde837 (parent a3abbe07db), committed via GitHub.
@@ -99,6 +99,7 @@ class BulkImport::Base
     load_indexes
     execute
     fix_primary_keys
+    execute_after
     puts "Done! Now run the 'import:ensure_consistency' rake task."
   end

@@ -227,6 +228,9 @@ class BulkImport::Base
     raise NotImplementedError
   end

+  def execute_after
+  end
+
   def fix_primary_keys
     puts "Updating primary key sequences..."
     @raw_connection.exec("SELECT setval('#{Group.sequence_name}', #{@last_group_id})") if @last_group_id > 0
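fix_primary_keys matters because the importer writes rows with explicit ids, which leaves each table's id sequence pointing below the data. A hedged sketch of the same setval idea against a bare pg connection (database, table, and sequence names are assumptions for the example):

    require "pg"

    conn = PG.connect(dbname: "discourse")
    # Bump the sequence past the highest inserted id; otherwise the next
    # ordinary INSERT would try to reuse an existing primary key.
    max_id = conn.exec("SELECT COALESCE(MAX(id), 0) AS max FROM groups")[0]["max"].to_i
    conn.exec("SELECT setval('groups_id_seq', #{max_id})") if max_id > 0

Note that the run loop above calls the new execute_after hook only after fix_primary_keys, which is presumably what makes the "Merge users AFTER fixing primary key sequences" item safe: merged accounts can create new rows without id collisions.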
@@ -713,6 +717,7 @@ class BulkImport::Base
     imported_ids = []
     process_method_name = "process_#{name}"
     sql = "COPY #{name.pluralize} (#{columns.map { |c| "\"#{c}\"" }.join(",")}) FROM STDIN"
+    rows_created = 0

     @raw_connection.copy_data(sql, @encoder) do
       rows.each do |row|
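For context on the COPY machinery in this hunk: the pg gem's copy_data wraps Postgres's COPY ... FROM STDIN so rows can be streamed through put_copy_data without per-row INSERT statements, and PG::TextEncoder::CopyRow turns Ruby arrays into COPY's text format. A self-contained sketch (connection parameters and the table are made up for the example):

    require "pg"

    conn = PG.connect(dbname: "discourse")
    encoder = PG::TextEncoder::CopyRow.new
    # Each put_copy_data call streams one row inside the COPY; the block
    # form finishes the COPY when it exits and aborts it on an exception.
    conn.copy_data("COPY users (id, username) FROM STDIN", encoder) do
      conn.put_copy_data [1, "alice"]
      conn.put_copy_data [2, "bob"]
    end
    conn.close

A single COPY is typically far faster than row-by-row inserts, which is why the importer funnels everything through this path.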
@@ -722,7 +727,8 @@ class BulkImport::Base
         imported_ids << mapped[:imported_id] unless mapped[:imported_id].nil?
         imported_ids |= mapped[:imported_ids] unless mapped[:imported_ids].nil?
         @raw_connection.put_copy_data columns.map { |c| processed[c] } unless processed[:skip]
-        print "\r%7d - %6d/sec" % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)] if imported_ids.size % 5000 == 0
+        rows_created += 1
+        print "\r%7d - %6d/sec" % [rows_created, rows_created.to_f / (Time.now - start)] if rows_created % 100 == 0
       rescue => e
         puts "\n"
         puts "ERROR: #{e.message}"
@@ -731,10 +737,7 @@ class BulkImport::Base
       end
     end

-    if imported_ids.size > 0
-      print "\r%7d - %6d/sec" % [imported_ids.size, imported_ids.size.to_f / (Time.now - start)]
-      puts
-    end
+    print "\r%7d - %6d/sec\n" % [rows_created, rows_created.to_f / (Time.now - start)] if rows_created > 0

     id_mapping_method_name = "#{name}_id_from_imported_id".freeze
     return unless respond_to?(id_mapping_method_name)
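The replacement progress line counts every processed row (rows_created) instead of only rows that produced an imported_id, which is the "report progress for tables without imported_id" fix. The printing idiom itself, isolated here as a runnable toy loop:

    start = Time.now
    rows_created = 0
    1.upto(10_000) do |i|
      # ... per-row work would happen here ...
      rows_created += 1
      # \r moves the cursor back to column 0, so each print overwrites
      # the previous count; printing every 100 rows keeps terminal I/O cheap.
      print "\r%7d - %6d/sec" % [rows_created, rows_created.to_f / (Time.now - start)] if rows_created % 100 == 0
    end
    puts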
@@ -745,6 +748,7 @@ class BulkImport::Base
       }
     end
   rescue => e
+    # FIXME: errors caught here stop the rest of the COPY
     puts e.message
     puts e.backtrace.join("\n")
   end
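Several commit-message items (properly deduplicate user emails, save the duplicated-users structure, parallelize the merge) revolve around grouping imported accounts by normalized email. A hedged sketch of that grouping with made-up data structures; the real importer's representation may differ:

    # Group candidate accounts by lowercased email; keep the first account
    # in each group and queue the rest for merging.
    users = [
      { id: 1, email: "Alice@example.com" },
      { id: 2, email: "alice@example.com" },
      { id: 3, email: "" },
    ]
    duplicated = {}
    users.group_by { |u| u[:email].to_s.strip.downcase }.each do |email, dupes|
      next if email.empty? || dupes.size < 2   # don't merge empty UserEmails
      keeper, *rest = dupes
      duplicated[email] = { keep: keeper, merge: rest }
    end
    # duplicated => { "alice@example.com" => { keep: { id: 1, ... }, merge: [{ id: 2, ... }] } }

Dumping duplicated to disk before merging matches the "save the duplicated-users structure for debugging purposes" item, and fanning the groups out across worker processes matches the parallelization note.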