From 445b35381d82a9ab34a02e66484aa9e700da9872 Mon Sep 17 00:00:00 2001 From: Gerhard Schlager Date: Mon, 23 Mar 2020 09:13:36 +0100 Subject: [PATCH] Improve Telligent import script * Detects mostly all attachments and it's a lot faster * Parses user properties in Ruby instead of the DB, because that's less errorprone * Imports user avatars * Imports topic views by users * Better handling of quotes and YouTube links --- script/import_scripts/telligent.rb | 313 +++++++++++++++++++---------- 1 file changed, 207 insertions(+), 106 deletions(-) diff --git a/script/import_scripts/telligent.rb b/script/import_scripts/telligent.rb index 6fb3c775f0d..0e166523cdb 100644 --- a/script/import_scripts/telligent.rb +++ b/script/import_scripts/telligent.rb @@ -42,11 +42,43 @@ class ImportScripts::Telligent < ImportScripts::Base BATCH_SIZE ||= 1000 LOCAL_AVATAR_REGEX ||= /\A~\/.*(?communityserver-components-(?:selectable)?avatars)\/(?[^\/]+)\/(?.+)/i REMOTE_AVATAR_REGEX ||= /\Ahttps?:\/\//i - EMBEDDED_ATTACHMENT_REGEX ||= /(?.*?)<\/a>/i + EMBEDDED_ATTACHMENT_REGEX ||= /.*?<\/a>/i + EMBEDDED_VIEW_ATTACHMENT_REGEX ||= /\[View:~\/cfs-file(?:\.ashx)?\/__key\/(?[^\/]+)\/(?[^\/]+)\/(?.+?)(?:\:[:\d\s]*?)\]/i + PROPERTY_NAMES_REGEX ||= /(?\w+):S:(?\d+):(?\d+):/ CATEGORY_LINK_NORMALIZATION = '/.*?(f\/\d+)$/\1' TOPIC_LINK_NORMALIZATION = '/.*?(f\/\d+\/t\/\d+)$/\1' + UNICODE_REPLACEMENTS = { + "5F00" => "_", + "2800" => "(", + "2900" => ")", + "2D00" => "-", + "2C00" => ",", + "2700" => "'", + "5B00" => "[", + "5D00" => "]", + "3D00" => "=", + "2600" => "&", + "2100" => "!", + "2300" => "#", + "7E00" => "~", + "2500" => "%", + "2E00" => ".", + "4000" => "@", + "2B00" => "+", + "2400" => "$", + "1920" => "’", + "E900" => "é", + "E000" => "à", + "F300" => "ó", + "1C20" => "“", + "1D20" => "”", + "B000" => "°", + "0003" => ["0300".to_i(16)].pack("U"), + "0103" => ["0301".to_i(16)].pack("U") + } + def initialize super() @@ -58,11 +90,16 @@ class ImportScripts::Telligent < ImportScripts::Base timeout: 60 # the user query is very slow ) + @filestore_root_directory = ENV["FILE_BASE_DIR"] + @files = {} + SiteSetting.tagging_enabled = true end def execute add_permalink_normalizations + index_filestore + import_categories import_users import_topics @@ -70,6 +107,11 @@ class ImportScripts::Telligent < ImportScripts::Base mark_topics_as_solved end + def index_filestore + puts "", "Indexing filestore..." + index_directory(@filestore_root_directory) + end + def import_users puts "", "Importing users..." @@ -94,32 +136,14 @@ class ImportScripts::Telligent < ImportScripts::Base loop do rows = query(<<~SQL) - SELECT * - FROM ( - SELECT TOP #{BATCH_SIZE} - u.UserID, u.Email, u.UserName,u.CreateDate, p.PropertyName, p.PropertyValue - FROM cs_Users u - LEFT OUTER JOIN ( - SELECT NULL AS UserID, ap.UserId AS MembershipID, x.PropertyName, x.PropertyValue - FROM aspnet_Profile ap - CROSS APPLY dbo.GetProperties(ap.PropertyNames, ap.PropertyValuesString) x - WHERE ap.PropertyNames NOT LIKE '%:-1%' AND - x.PropertyName IN ('bio', 'commonName', 'location', 'webAddress') - UNION - SELECT up.UserID, NULL AS MembershipID, x.PropertyName, CAST(x.PropertyValue AS NVARCHAR) AS PropertyValue - FROM cs_UserProfile up - CROSS APPLY dbo.GetProperties(up.PropertyNames, up.PropertyValues) x - WHERE up.PropertyNames NOT LIKE '%:-1%' AND - x.PropertyName IN ('avatarUrl', 'BannedUntil', 'UserBanReason') - ) p ON p.UserID = u.UserID OR p.MembershipID = u.MembershipID - WHERE u.UserID > #{last_user_id} AND #{user_conditions} - ORDER BY u.UserID - ) x - PIVOT ( - MAX(PropertyValue) - FOR PropertyName - IN (bio, commonName, location, webAddress, avatarUrl, BannedUntil, UserBanReason) - ) Y + SELECT TOP #{BATCH_SIZE} + u.UserID, u.Email, u.UserName, u.CreateDate, + ap.PropertyNames AP_PropertyNames, ap.PropertyValuesString AS AP_PropertyValues, + up.PropertyNames UP_PropertyNames, up.PropertyValues AS UP_PropertyValues + FROM cs_Users u + LEFT OUTER JOIN aspnet_Profile ap ON ap.UserId = u.MembershipID + LEFT OUTER JOIN cs_UserProfile up ON up.UserID = u.UserID + WHERE u.UserID > #{last_user_id} AND #{user_conditions} ORDER BY UserID SQL @@ -132,18 +156,21 @@ class ImportScripts::Telligent < ImportScripts::Base end create_users(rows, total: total_count, offset: import_count) do |row| + ap_properties = parse_properties(row["AP_PropertyNames"], row["AP_PropertyValues"]) + up_properties = parse_properties(row["UP_PropertyNames"], row["UP_PropertyValues"]) + { id: row["UserID"], email: row["Email"], username: row["UserName"], - name: row["commonName"], + name: ap_properties["commonName"], created_at: row["CreateDate"], - bio_raw: html_to_markdown(row["bio"]), - location: row["location"], - website: row["webAddress"], + bio_raw: html_to_markdown(ap_properties["bio"]), + location: ap_properties["location"], + website: ap_properties["webAddress"], post_create_action: proc do |user| - import_avatar(user, row["avatarUrl"]) - suspend_user(user, row["BannedUntil"], row["UserBanReason"]) + import_avatar(user, up_properties["avatarUrl"]) + suspend_user(user, up_properties["BannedUntil"], up_properties["UserBanReason"]) end } end @@ -154,10 +181,10 @@ class ImportScripts::Telligent < ImportScripts::Base # TODO move into base importer (create_user) and use consistent error handling def import_avatar(user, avatar_url) - return if ENV["FILE_BASE_DIR"].blank? || avatar_url.blank? || avatar_url.include?("anonymous") + return if @filestore_root_directory.blank? || avatar_url.blank? || avatar_url.include?("anonymous") if match_data = avatar_url.match(LOCAL_AVATAR_REGEX) - avatar_path = File.join(ENV["FILE_BASE_DIR"], + avatar_path = File.join(@filestore_root_directory, match_data[:directory].gsub("-", "."), match_data[:path].split("-"), match_data[:filename]) @@ -333,7 +360,7 @@ class ImportScripts::Telligent < ImportScripts::Base batches do |offset| rows = query(<<~SQL) SELECT TOP #{BATCH_SIZE} - t.ThreadId, t.ForumId, t.UserId, t.TotalViews, + t.ThreadId, t.ForumId, t.UserId, t.TotalViews, t.ContentID AS TopicContentId, t.Subject, t.Body, t.DateCreated, t.IsLocked, t.StickyDate, a.ApplicationTypeId, a.ApplicationId, a.ApplicationContentTypeId, a.ContentId, a.FileName, a.IsRemote FROM te_Forum_Threads t @@ -365,6 +392,7 @@ class ImportScripts::Telligent < ImportScripts::Base Jobs.enqueue_at(topic.pinned_until, :unpin_topic, topic_id: topic.id) if topic.pinned_until url = "f/#{row['ForumId']}/t/#{row['ThreadId']}" Permalink.create(url: url, topic_id: topic.id) unless Permalink.exists?(url: url) + import_topic_views(topic, row["TopicContentId"]) end } @@ -382,6 +410,29 @@ class ImportScripts::Telligent < ImportScripts::Base "T#{topic_id}" end + def import_topic_views(topic, content_id) + last_user_id = -1 + + batches do |_| + rows = query(<<~SQL) + SELECT TOP #{BATCH_SIZE} + UserId, MAX(CreatedUtcDate) AS ViewDate + FROM te_Content_Views + WHERE ContentId = '#{content_id}' AND UserId > #{last_user_id} + GROUP BY UserId + ORDER BY UserId + SQL + + break if rows.blank? + last_user_id = rows[-1]["UserId"] + + rows.each do |row| + user_id = user_id_from_imported_user_id(row["UserId"]) + TopicViewItem.add(topic.id, "127.0.0.1", user_id, row["ViewDate"], true) if user_id + end + end + end + def ignored_forum_sql_condition @ignored_forum_sql_condition ||= @ignored_forum_ids.present? \ ? "t.ForumId NOT IN (#{@ignored_forum_ids.join(',')})" \ @@ -452,28 +503,42 @@ class ImportScripts::Telligent < ImportScripts::Base end end + def index_directory(root_directory) + Dir.foreach(root_directory) do |directory_name| + next if directory_name == "." || directory_name == ".." + + path = File.join(root_directory, directory_name) + if File.directory?(path) + index_directory(path) + else + path.delete_prefix!(@filestore_root_directory) + path.delete_prefix!("/") + @files[path.downcase] = path + end + end + end + def raw_with_attachment(row, user_id, type) raw, embedded_paths, upload_ids = replace_embedded_attachments(row["Body"], user_id) raw = html_to_markdown(raw) || "" filename = row["FileName"] - return raw if ENV["FILE_BASE_DIR"].blank? || filename.blank? + return raw if @filestore_root_directory.blank? || filename.blank? if row["IsRemote"] return "#{raw}\n#{filename}" end path = File.join( - ENV["FILE_BASE_DIR"], "telligent.evolution.components.attachments", "%02d" % row["ApplicationTypeId"], "%02d" % row["ApplicationId"], "%02d" % row["ApplicationContentTypeId"], - ("%010d" % row["ContentId"]).scan(/.{2}/), - clean_filename(filename) + ("%010d" % row["ContentId"]).scan(/.{2}/) ) + path = fix_attachment_path(path, filename) - unless embedded_paths.include?(path) + if path && !embedded_paths.include?(path) if File.file?(path) upload = @uploader.create_upload(user_id, path, filename) @@ -493,91 +558,110 @@ class ImportScripts::Telligent < ImportScripts::Base paths = [] upload_ids = [] - return [raw, paths, upload_ids] if ENV["FILE_BASE_DIR"].blank? + return [raw, paths, upload_ids] if @filestore_root_directory.blank? - raw = raw.gsub(EMBEDDED_ATTACHMENT_REGEX) do - filename, path = attachment_path(Regexp.last_match) + [EMBEDDED_ATTACHMENT_REGEX, EMBEDDED_VIEW_ATTACHMENT_REGEX].each do |regex| + raw = raw.gsub(regex) do + match_data = Regexp.last_match - if File.file?(path) - upload = @uploader.create_upload(user_id, path, filename) + path = File.join(match_data[:directory], match_data[:path]) + fixed_path = fix_attachment_path(path, match_data[:filename]) - if upload.present? && upload.persisted? - paths << path - upload_ids << upload.id - @uploader.html_for_upload(upload, filename) + if fixed_path && File.file?(fixed_path) + filename = File.basename(fixed_path) + upload = @uploader.create_upload(user_id, fixed_path, filename) + + if upload.present? && upload.persisted? + paths << fixed_path + upload_ids << upload.id + @uploader.html_for_upload(upload, filename) + end + else + path = File.join(path, match_data[:filename]) + STDERR.puts "Could not find file: #{path}" end - else - STDERR.puts "Could not find file: #{path}" end end [raw, paths, upload_ids] end - def clean_filename(filename) + def fix_attachment_path(base_path, filename) + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + + base_path.downcase! + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + filename = CGI.unescapeHTML(filename) - filename.gsub!(/[\x00\/\\:\*\?\"<>\|]/, '_') + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) - %w|( ) # % - _ [ ] = , ' ~ ! + { } & @ #|.each do |c| - number = "#{c.ord.to_s(16).upcase}00" - filename.gsub!("_#{number}_", c) - filename.gsub!("_#{number}#{number}_", c * 2) - end + filename.gsub!("-", " ") + filename.strip! + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) - filename + directories = base_path.split(File::SEPARATOR) + first_directory = directories.shift + first_directory.gsub!("-", ".") + base_path = File.join(first_directory, directories) + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + + directories.map! { |d| File.join(d.split(/[\.\-]/).map(&:strip)) } + base_path = File.join(first_directory, directories) + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + + directories = base_path.split(File::SEPARATOR) + directories.map! { |d| d.gsub("+", " ").strip } + base_path = File.join(directories) + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + + replace_codes!(filename) + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + + replace_codes!(base_path) + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + + filename.gsub!(/(?:\:\d+)+$/, "") + path = find_correct_path(base_path, filename) + return path if attachment_exists?(path) + + path = File.join(base_path, filename) + path_regex = Regexp.new("^#{Regexp.escape(path)}-\\d+x\\d+\\.\\w+$", Regexp::IGNORECASE) + path = find_correct_path_with_regex(path_regex) + return path if attachment_exists?(path) + + nil end - def attachment_path(match_data) - filename, path = join_attachment_path(match_data, filename_index: 2) - filename, path = join_attachment_path(match_data, filename_index: 1) unless File.file?(path) - [filename, path] + def find_correct_path(base_path, filename) + path = File.join(base_path, filename) + path = @files[path.downcase] + path ? File.join(@filestore_root_directory, path) : nil end - # filenames are a total mess - try to guess the correct filename - # works for 70% of all files - def join_attachment_path(match_data, filename_index:) - filename = clean_filename(match_data[:"filename#{filename_index}"]) - base_path = File.join( - ENV["FILE_BASE_DIR"], - match_data[:directory].gsub("-", ".").downcase, - match_data[:path].gsub("+", " ").gsub(/_\h{4}_/, "?").split(/[\.\-]/).map(&:strip) - ) + def find_correct_path_with_regex(regex) + keys = @files.keys.filter { |key| regex =~ key } + keys.size == 1 ? File.join(@filestore_root_directory, @files[keys.first]) : nil + end - original_filename = filename.dup + def attachment_exists?(path) + path.present? && File.file?(path) + end - filename.gsub!(/[_\- ]/, "?") - path = File.join(base_path, filename) - paths = Dir.glob(path, File::FNM_CASEFOLD) - if (path = paths.first) && paths.size == 1 - filename = File.basename(path) - return [filename, path] + def replace_codes!(text) + text.gsub!(/_(\h{4}+)_/i) do + codes = Regexp.last_match[1].upcase.scan(/.{4}/) + mapped_codes = codes.map { |c| UNICODE_REPLACEMENTS[c] } + mapped_codes.any? { |c| c.nil? } ? Regexp.last_match[0] : mapped_codes.join("") end - - filename = original_filename.dup - filename.gsub!(/_\h{2}00_/, "?") - filename.gsub!(/_\h{2}00\h{2}00_/, "??") - filename.gsub!(/_\h{2}00\h{2}00\h{2}00_/, "???") - filename.gsub!(/[_\- ]/, "?") - path = File.join(base_path, filename) - paths = Dir.glob(path, File::FNM_CASEFOLD) - if (path = paths.first) && paths.size == 1 - filename = File.basename(path) - return [filename, path] - end - - filename = original_filename.dup - filename.gsub!(/_\h{4}_/, "?") - filename.gsub!(/_\h{4}\h{4}_/, "??") - filename.gsub!(/_\h{4}\h{4}\h{4}_/, "???") - filename.gsub!(/[_\- ]/, "?") - path = File.join(base_path, filename) - paths = Dir.glob(path, File::FNM_CASEFOLD) - if (path = paths.first) && paths.size == 1 - filename = File.basename(path) - return [filename, path] - end - - [original_filename, File.join(base_path, original_filename)] end def html_to_markdown(html) @@ -586,10 +670,27 @@ class ImportScripts::Telligent < ImportScripts::Base md = HtmlToMarkdown.new(html).to_markdown md.gsub!(/\[quote.*?\]/, "\n" + '\0' + "\n") md.gsub!(/(?