From b00554d66285b3b2bf10f6c947639d9ae447eca7 Mon Sep 17 00:00:00 2001
From: Sam
Date: Tue, 14 Jul 2015 15:53:38 +1000
Subject: [PATCH] initial lithium importer (work in progress)

---
 script/import_scripts/lithium.rb              | 592 ++++++++++++++++++
 .../support/convert_mysql_xml_to_mysql.rb     |  80 +++
 2 files changed, 672 insertions(+)
 create mode 100644 script/import_scripts/lithium.rb
 create mode 100644 script/import_scripts/support/convert_mysql_xml_to_mysql.rb

diff --git a/script/import_scripts/lithium.rb b/script/import_scripts/lithium.rb
new file mode 100644
index 00000000000..e80221fd38f
--- /dev/null
+++ b/script/import_scripts/lithium.rb
@@ -0,0 +1,592 @@
+# Notes:
+#
+# Written by Sam
+#
+# Lithium is quite protective of its data, so there is no simple way to export it.
+# If you have leverage you may get a data dump; in my case it was provided in
+# XML format.
+#
+# The first step is to convert that dump into SQL so it can be imported into a
+# MySQL database. That was done using
+# import_scripts/support/convert_mysql_xml_to_mysql.rb (a usage sketch is at the
+# end of that script).
+#
+
+
+
+require 'mysql2'
+require 'reverse_markdown'
+require File.expand_path(File.dirname(__FILE__) + "/base.rb")
+require 'htmlentities'
+
+class ImportScripts::Lithium < ImportScripts::Base
+  BATCH_SIZE = 1000
+
+  # CHANGE THESE BEFORE RUNNING THE IMPORTER
+  DATABASE = "wd"
+  PASSWORD = "password"
+  TIMEZONE = "Asia/Kolkata"
+  ATTACHMENT_DIR = '/path/to/your/attachment/folder'
+
+  def initialize
+    super
+
+    @old_username_to_new_usernames = {}
+
+    @tz = TZInfo::Timezone.get(TIMEZONE)
+
+    @htmlentities = HTMLEntities.new
+
+    @client = Mysql2::Client.new(
+      host: "localhost",
+      username: "root",
+      password: PASSWORD,
+      database: DATABASE
+    )
+  end
+
+  def execute
+    # import_groups
+    import_users
+    import_categories
+    # import_topics
+    # import_posts
+    # import_attachments
+    #
+    # close_topics
+    # post_process_posts
+  end
+
+  def import_groups
+    puts "", "importing groups..."
+
+    groups = mysql_query <<-SQL
+      SELECT usergroupid, title
+      FROM usergroup
+      ORDER BY usergroupid
+    SQL
+
+    create_groups(groups) do |group|
+      {
+        id: group["usergroupid"],
+        name: @htmlentities.decode(group["title"]).strip
+      }
+    end
+  end
+
+  def import_users
+    puts "", "importing users"
+
+    user_count = mysql_query("SELECT COUNT(*) count FROM users").first["count"]
+
+    batches(BATCH_SIZE) do |offset|
+      users = mysql_query <<-SQL
+        SELECT id, nlogin, login_canon, email, registration_time
+        FROM users
+        ORDER BY id
+        LIMIT #{BATCH_SIZE}
+        OFFSET #{offset}
+      SQL
+
+      break if users.size < 1
+
+      create_users(users, total: user_count, offset: offset) do |user|
+
+        {
+          id: user["id"],
+          name: user["nlogin"],
+          username: user["login_canon"],
+          email: user["email"].presence || fake_email,
+          # website: user["homepage"].strip,
+          # title: @htmlentities.decode(user["usertitle"]).strip,
+          # primary_group_id: group_id_from_imported_group_id(user["usergroupid"]),
+          created_at: unix_time(user["registration_time"]),
+          # post_create_action: proc do |u|
+          #   @old_username_to_new_usernames[user["username"]] = u.username
+          #   import_profile_picture(user, u)
+          #   import_profile_background(user, u)
+          # end
+        }
+      end
+    end
+  end
+
+  def unix_time(t)
+    Time.at(t / 1000.0)
+  end
+
+  def import_profile_picture(old_user, imported_user)
+    query = mysql_query <<-SQL
+      SELECT filedata, filename
+      FROM customavatar
+      WHERE userid = #{old_user["userid"]}
+      ORDER BY dateline DESC
+      LIMIT 1
+    SQL
+
+    picture = query.first
+
+    return if picture.nil?
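+
+    # write the raw avatar bytes to a tempfile so Upload.create_for can
+    # process them like a regular file upload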
+    file = Tempfile.new("profile-picture")
+    file.write(picture["filedata"].encode("ASCII-8BIT").force_encoding("UTF-8"))
+    file.rewind
+
+    upload = Upload.create_for(imported_user.id, file, picture["filename"], file.size)
+
+    return if !upload.persisted?
+
+    imported_user.create_user_avatar
+    imported_user.user_avatar.update(custom_upload_id: upload.id)
+    imported_user.update(uploaded_avatar_id: upload.id)
+  ensure
+    file.close rescue nil
+    file.unlink rescue nil
+  end
+
+  def import_profile_background(old_user, imported_user)
+    query = mysql_query <<-SQL
+      SELECT filedata, filename
+      FROM customprofilepic
+      WHERE userid = #{old_user["userid"]}
+      ORDER BY dateline DESC
+      LIMIT 1
+    SQL
+
+    background = query.first
+
+    return if background.nil?
+
+    file = Tempfile.new("profile-background")
+    file.write(background["filedata"].encode("ASCII-8BIT").force_encoding("UTF-8"))
+    file.rewind
+
+    upload = Upload.create_for(imported_user.id, file, background["filename"], file.size)
+
+    return if !upload.persisted?
+
+    imported_user.user_profile.update(profile_background: upload.url)
+  ensure
+    file.close rescue nil
+    file.unlink rescue nil
+  end
+
+  def import_categories
+    puts "", "importing top level categories..."
+
+    categories = mysql_query("SELECT node_id, display_id, position, parent_node_id from nodes").to_a
+
+    # HACK: in this particular dump the top level categories are the nodes
+    # whose parent is node_id 2
+    top_level_categories = categories.select { |c| c["parent_node_id"] == 2 }
+
+    create_categories(top_level_categories) do |category|
+      {
+        id: category["node_id"],
+        name: category["display_id"],
+        position: category["position"]
+        # description:
+      }
+    end
+
+
+    puts "", "importing children categories..."
+
+    children_categories = categories.select { |c| ![1,2].include?(c["parent_node_id"]) && ![1,2].include?(c["node_id"]) }
+
+    top_level_category_ids = Set.new(top_level_categories.map { |c| c["node_id"] })
+
+    # cut down the tree to only 2 levels of categories
+    children_categories.each do |cc|
+      while !top_level_category_ids.include?(cc["parent_node_id"])
+        cc["parent_node_id"] = categories.detect { |c| c["node_id"] == cc["parent_node_id"] }["parent_node_id"]
+      end
+    end
+
+    create_categories(children_categories) do |category|
+      {
+        id: category["node_id"],
+        name: category["display_id"],
+        position: category["position"],
+        # description: ,
+        parent_category_id: category_id_from_imported_category_id(category["parent_node_id"])
+      }
+    end
+  end
+
+  def import_topics
+    puts "", "importing topics..."
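+
+    # Lithium keeps topics and replies in the same message2 table; a row is a
+    # topic when its id equals root_id. Imported topic ids are composed as
+    # "<node_id> <id>" so they stay unique across nodes.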
+
+    # # keep track of closed topics
+    # @closed_topic_ids = []
+
+    topic_count = mysql_query("SELECT COUNT(*) count FROM message2 where id = root_id").first["count"]
+
+    batches(BATCH_SIZE) do |offset|
+      topics = mysql_query <<-SQL
+        SELECT id, subject, body, deleted, user_id,
+               post_date, views, node_id
+        FROM message2
+        WHERE id = root_id
+        ORDER BY id
+        LIMIT #{BATCH_SIZE}
+        OFFSET #{offset}
+      SQL
+
+      break if topics.size < 1
+
+      create_posts(topics, total: topic_count, offset: offset) do |topic|
+
+        # @closed_topic_ids << topic_id if topic["open"] == "0"
+
+        raw = ReverseMarkdown.convert(topic["body"])
+
+        t = {
+          id: "#{topic["node_id"]} #{topic["id"]}",
+          user_id: user_id_from_imported_user_id(topic["user_id"]) || Discourse::SYSTEM_USER_ID,
+          title: @htmlentities.decode(topic["subject"]).strip[0...255],
+          category: category_id_from_imported_category_id(topic["node_id"]),
+          raw: raw,
+          created_at: unix_time(topic["post_date"]),
+          views: topic["views"],
+        }
+
+        if topic["deleted"] > 0
+          t[:deleted_at] = Time.now
+        end
+
+        t
+      end
+    end
+  end
+
+  def import_posts
+    puts "", "importing posts..."
+
+    post_count = mysql_query("SELECT COUNT(*) count FROM message2
+                              WHERE id <> root_id").first["count"]
+
+    batches(BATCH_SIZE) do |offset|
+      posts = mysql_query <<-SQL
+        SELECT id, body, deleted, user_id,
+               post_date, parent_id, root_id, node_id
+        FROM message2
+        WHERE id <> root_id
+        ORDER BY root_id, id
+        LIMIT #{BATCH_SIZE}
+        OFFSET #{offset}
+      SQL
+
+      break if posts.size < 1
+
+      create_posts(posts, total: post_count, offset: offset) do |post|
+        next unless topic = topic_lookup_from_imported_post_id("#{post["node_id"]} #{post["root_id"]}")
+
+        raw = ReverseMarkdown.convert(post["body"])
+
+        new_post = {
+          id: "#{post["node_id"]} #{post["root_id"]} #{post["id"]}",
+          user_id: user_id_from_imported_user_id(post["user_id"]) || Discourse::SYSTEM_USER_ID,
+          topic_id: topic[:topic_id],
+          raw: raw,
+          created_at: unix_time(post["post_date"]),
+        }
+
+        if post["deleted"] > 0
+          new_post[:deleted_at] = Time.now
+        end
+
+        if parent = topic_lookup_from_imported_post_id("#{post["node_id"]} #{post["root_id"]} #{post["parent_id"]}")
+          new_post[:reply_to_post_number] = parent[:post_number]
+        end
+
+        new_post
+      end
+    end
+  end
+
+  # find the uploaded file information from the db
+  def find_upload(post, attachment_id)
+    sql = "SELECT a.attachmentid attachment_id, a.userid user_id, a.filedataid file_id, a.filename filename,
+                  a.caption caption
+             FROM attachment a
+            WHERE a.attachmentid = #{attachment_id}"
+    results = mysql_query(sql)
+
+    unless (row = results.first)
+      puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}"
+      return nil
+    end
+
+    filename = File.join(ATTACHMENT_DIR, row['user_id'].to_s.split('').join('/'), "#{row['file_id']}.attach")
+    unless File.exists?(filename)
+      puts "Attachment file doesn't exist: #{filename}"
+      return nil
+    end
+    real_filename = row['filename']
+    real_filename.prepend SecureRandom.hex if real_filename[0] == '.'
+    upload = create_upload(post.user.id, filename, real_filename)
+
+    if upload.nil? || !upload.valid?
+      puts "Upload not valid :("
+      puts upload.errors.inspect if upload
+      return nil
+    end
+
+    return upload, real_filename
+  rescue Mysql2::Error => e
+    puts "SQL Error"
+    puts e.message
+    puts sql
+    return nil
+  end
+
+  def import_attachments
+    puts '', 'importing attachments...'
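+
+    # NOTE: the code below still queries vBulletin style tables (post, thread,
+    # and attachment via find_upload) and looks carried over from the vBulletin
+    # importer as a starting point; it has not been adapted to the Lithium
+    # schema yet, which is why import_attachments is commented out in execute.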
+
+    current_count = 0
+    total_count = mysql_query("SELECT COUNT(postid) count FROM post WHERE postid NOT IN (SELECT firstpostid FROM thread)").first["count"]
+
+    success_count = 0
+    fail_count = 0
+
+    attachment_regex = /\[attach[^\]]*\](\d+)\[\/attach\]/i
+
+    Post.find_each do |post|
+      current_count += 1
+      print_status current_count, total_count
+
+      new_raw = post.raw.dup
+      new_raw.gsub!(attachment_regex) do |s|
+        matches = attachment_regex.match(s)
+        attachment_id = matches[1]
+
+        upload, filename = find_upload(post, attachment_id)
+        unless upload
+          fail_count += 1
+          next
+        end
+
+        html_for_upload(upload, filename)
+      end
+
+      if new_raw != post.raw
+        PostRevisor.new(post).revise!(post.user, { raw: new_raw }, { bypass_bump: true, edit_reason: 'Import attachments from vBulletin' })
+      end
+
+      success_count += 1
+    end
+  end
+
+  def close_topics
+    puts "", "Closing topics..."
+
+    sql = <<-SQL
+      WITH closed_topic_ids AS (
+        SELECT t.id AS topic_id
+        FROM post_custom_fields pcf
+        JOIN posts p ON p.id = pcf.post_id
+        JOIN topics t ON t.id = p.topic_id
+        WHERE pcf.name = 'import_id'
+        AND pcf.value IN (?)
+      )
+      UPDATE topics
+      SET closed = true
+      WHERE id IN (SELECT topic_id FROM closed_topic_ids)
+    SQL
+
+    Topic.exec_sql(sql, @closed_topic_ids)
+  end
+
+  def post_process_posts
+    puts "", "Postprocessing posts..."
+
+    current = 0
+    max = Post.count
+
+    Post.find_each do |post|
+      begin
+        new_raw = postprocess_post_raw(post.raw)
+        if new_raw != post.raw
+          post.raw = new_raw
+          post.save
+        end
+      rescue PrettyText::JavaScriptError
+        nil
+      ensure
+        print_status(current += 1, max)
+      end
+    end
+  end
+
+  def preprocess_post_raw(raw)
+    return "" if raw.blank?
+
+    # decode HTML entities
+    raw = @htmlentities.decode(raw)
+
+    # fix whitespaces
+    raw = raw.gsub(/(\\r)?\\n/, "\n")
+             .gsub("\\t", "\t")
+
+    # [HTML]...[/HTML]
+    raw = raw.gsub(/\[html\]/i, "\n```html\n")
+             .gsub(/\[\/html\]/i, "\n```\n")
+
+    # [PHP]...[/PHP]
+    raw = raw.gsub(/\[php\]/i, "\n```php\n")
+             .gsub(/\[\/php\]/i, "\n```\n")
+
+    # [HIGHLIGHT="..."]
+    raw = raw.gsub(/\[highlight="?(\w+)"?\]/i) { "\n```#{$1.downcase}\n" }
+
+    # [CODE]...[/CODE]
+    # [HIGHLIGHT]...[/HIGHLIGHT]
+    raw = raw.gsub(/\[\/?code\]/i, "\n```\n")
+             .gsub(/\[\/?highlight\]/i, "\n```\n")
+
+    # [SAMP]...[/SAMP]
+    raw = raw.gsub(/\[\/?samp\]/i, "`")
+
+    # replace all chevrons with HTML entities
+    # NOTE: must be done
+    # - AFTER all the "code" processing
+    # - BEFORE the "quote" processing
+    raw = raw.gsub(/`([^`]+)`/im) { "`" + $1.gsub("<", "\u2603") + "`" }
+             .gsub("<", "&lt;")
+             .gsub("\u2603", "<")
+
+    raw = raw.gsub(/`([^`]+)`/im) { "`" + $1.gsub(">", "\u2603") + "`" }
+             .gsub(">", "&gt;")
+             .gsub("\u2603", ">")
+
+    # [URL=...]...[/URL]
+    raw = raw.gsub(/\[url="?(.+?)"?\](.+)\[\/url\]/i) { "[#{$2}](#{$1})" }
+
+    # [URL]...[/URL]
+    # [MP3]...[/MP3]
+    raw = raw.gsub(/\[\/?url\]/i, "")
+             .gsub(/\[\/?mp3\]/i, "")
+
+    # [MENTION]<username>[/MENTION]
+    raw = raw.gsub(/\[mention\](.+?)\[\/mention\]/i) do
+      old_username = $1
+      if @old_username_to_new_usernames.has_key?(old_username)
+        old_username = @old_username_to_new_usernames[old_username]
+      end
+      "@#{old_username}"
+    end
+
+    # [MENTION=<user_id>]<username>[/MENTION]
+    # raw = raw.gsub(/\[mention="?(\d+)"?\](.+?)\[\/mention\]/i) do
+    #   user_id, old_username = $1, $2
+    #   if user = @users.select { |u| u[:userid] == user_id }.first
+    #     old_username = @old_username_to_new_usernames[user[:username]] || user[:username]
+    #   end
+    #   "@#{old_username}"
+    # end
+
+    # [QUOTE]...[/QUOTE]
+    raw = raw.gsub(/\[quote\](.+?)\[\/quote\]/im) { "\n> #{$1}\n" }
+
+    # [QUOTE=<username>]...[/QUOTE]
+    raw = raw.gsub(/\[quote=([^;\]]+)\](.+?)\[\/quote\]/im) do
+      old_username, quote = $1, $2
+      if @old_username_to_new_usernames.has_key?(old_username)
+        old_username = @old_username_to_new_usernames[old_username]
+      end
+      "\n[quote=\"#{old_username}\"]\n#{quote}\n[/quote]\n"
+    end
+
+    # [YOUTUBE]<id>[/YOUTUBE]
+    raw = raw.gsub(/\[youtube\](.+?)\[\/youtube\]/i) { "\n//youtu.be/#{$1}\n" }
+
+    # [VIDEO=youtube;<id>]...[/VIDEO]
+    raw = raw.gsub(/\[video=youtube;([^\]]+)\].*?\[\/video\]/i) { "\n//youtu.be/#{$1}\n" }
+
+    raw
+  end
+
+  def postprocess_post_raw(raw)
+    # [QUOTE=<username>;<post_id>]...[/QUOTE]
+    raw = raw.gsub(/\[quote=([^;]+);(\d+)\](.+?)\[\/quote\]/im) do
+      old_username, post_id, quote = $1, $2, $3
+
+      if @old_username_to_new_usernames.has_key?(old_username)
+        old_username = @old_username_to_new_usernames[old_username]
+      end
+
+      if topic_lookup = topic_lookup_from_imported_post_id(post_id)
+        post_number = topic_lookup[:post_number]
+        topic_id = topic_lookup[:topic_id]
+        "\n[quote=\"#{old_username},post:#{post_number},topic:#{topic_id}\"]\n#{quote}\n[/quote]\n"
+      else
+        "\n[quote=\"#{old_username}\"]\n#{quote}\n[/quote]\n"
+      end
+    end
+
+    # remove attachments
+    raw = raw.gsub(/\[attach[^\]]*\]\d+\[\/attach\]/i, "")
+
+    # [THREAD]<thread_id>[/THREAD]
+    # ==> http://my.discourse.org/t/slug/<topic_id>
+    raw = raw.gsub(/\[thread\](\d+)\[\/thread\]/i) do
+      thread_id = $1
+      if topic_lookup = topic_lookup_from_imported_post_id("thread-#{thread_id}")
+        topic_lookup[:url]
+      else
+        $&
+      end
+    end
+
+    # [THREAD=<thread_id>]...[/THREAD]
+    # ==> [...](http://my.discourse.org/t/slug/<topic_id>)
+    raw = raw.gsub(/\[thread=(\d+)\](.+?)\[\/thread\]/i) do
+      thread_id, link = $1, $2
+      if topic_lookup = topic_lookup_from_imported_post_id("thread-#{thread_id}")
+        url = topic_lookup[:url]
+        "[#{link}](#{url})"
+      else
+        $&
+      end
+    end
+
+    # [POST]<post_id>[/POST]
+    # ==> http://my.discourse.org/t/slug/<topic_id>/<post_number>
+    raw = raw.gsub(/\[post\](\d+)\[\/post\]/i) do
+      post_id = $1
+      if topic_lookup = topic_lookup_from_imported_post_id(post_id)
+        topic_lookup[:url]
+      else
+        $&
+      end
+    end
+
+    # [POST=<post_id>]...[/POST]
+    # ==> [...](http://my.discourse.org/t/<topic_id>/<post_number>)
+    raw = raw.gsub(/\[post=(\d+)\](.+?)\[\/post\]/i) do
+      post_id, link = $1, $2
+      if topic_lookup = topic_lookup_from_imported_post_id(post_id)
+        url = topic_lookup[:url]
+        "[#{link}](#{url})"
+      else
+        $&
+      end
+    end
+
+    raw
+  end
+
+  def parse_timestamp(timestamp)
+    Time.zone.at(@tz.utc_to_local(timestamp))
+  end
+
+  def fake_email
+    SecureRandom.hex << "@domain.com"
+  end
+
+  def mysql_query(sql)
+    @client.query(sql, cache_rows: false)
+  end
+
+end
+
+ImportScripts::Lithium.new.perform
diff --git a/script/import_scripts/support/convert_mysql_xml_to_mysql.rb b/script/import_scripts/support/convert_mysql_xml_to_mysql.rb
new file mode 100644
index 00000000000..1b4e4eb7b87
--- /dev/null
+++ b/script/import_scripts/support/convert_mysql_xml_to_mysql.rb
@@ -0,0 +1,80 @@
+# convert a huge XML dump into a MySQL friendly SQL import
+#
+
+require 'ox'
+require 'set'
+
+class Saxy < Ox::Sax
+
+  def initialize
+    @stack = []
+  end
+
+  def start_element(name)
+    @stack << { elem: name }
+  end
+
+  def end_element(name)
+    @stack.pop
+  end
+
+  def attr(name, value)
+    return unless @stack[-1]
+    (@stack[-1][:attrs] ||= {})[name] = value
+  end
+
+  def text(val)
+    @stack[-1][:text] = val
+  end
+
+  def cdata(val)
+    @stack[-1][:text] = val
+  end
+
+end
+
+class Convert < Saxy
+  def initialize(opts)
+    @tables = {}
+    @skip_data = Set.new(opts[:skip_data])
+    super()
+  end
+
+
+  def end_element(name)
+    old = @stack.pop
+    cur = @stack[-1]
+
+    if name == :field && cur[:elem] == :row
+      (cur[:row_data] ||= {})[old[:attrs][:name]] = old[:text]
+    elsif name == :row && cur[:elem] == :table_data
+      create_insert_statement(old)
+    elsif name == :field && cur[:elem] == :table_structure
+      (cur[:cols] ||= []) << old
+    elsif name == :table_structure
+      output_table_definition(old)
+      @tables[old[:attrs][:name]] = old
+    end
+  end
+
+  def output_table_definition(data)
+    cols = data[:cols].map do |col|
+      attrs = col[:attrs]
+      "#{attrs[:Field]} #{attrs[:Type]}"
+    end.join(", ")
+    puts "CREATE TABLE #{data[:attrs][:name]} (#{cols});"
+  end
+
+  def create_insert_statement(data)
+    name = @stack[-1][:attrs][:name]
+    return if @skip_data.include?(name)
+
+    row = data[:row_data]
+    col_names = row.keys.join(",")
+    vals = row.values.map { |v| "'#{v.to_s.gsub("'", "''").gsub('\\', '\\\\\\')}'" }.join(",")
+    puts "INSERT INTO #{name} (#{col_names}) VALUES (#{vals});"
+  end
+end
+
+Ox.sax_parse(Convert.new(skip_data: ['metrics2', 'user_log']), File.open(ARGV[0]))
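+
+# Example usage (a sketch; the dump and output file names are placeholders,
+# and "wd" matches the DATABASE constant in lithium.rb):
+#
+#   ruby convert_mysql_xml_to_mysql.rb lithium-dump.xml > lithium.sql
+#   mysql -u root -p wd < lithium.sql
+#
+# The script reads the XML dump given as ARGV[0] (it expects mysqldump style
+# table_structure/table_data/row/field elements) and prints CREATE TABLE and
+# INSERT statements to stdout, skipping the row data of the tables listed in
+# skip_data.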