Improvements to phpBB3 import script (#10999)

* FEATURE: Import attachments

* FEATURE: Add support for importing multiple phpBB forums into a single Discourse forum

* FEATURE: Add support for category and tag mapping

* FEATURE: Import groups

* FIX: Add spaces around images

* FEATURE: Custom mapping of user rank to trust levels

* FIX: Do not fail import if it cannot import polls

* FIX: Optimize existing records lookup

Co-authored-by: Gerhard Schlager <mail@gerhard-schlager.at>
Co-authored-by: Jarek Radosz <jradosz@gmail.com>
This commit is contained in:
Bianca Nenciu
2021-01-14 21:44:43 +02:00
committed by GitHub
parent 82af278ae5
commit a71b219c9a
20 changed files with 1606 additions and 79 deletions

View File

@ -606,10 +606,15 @@ class ImportScripts::Base
skipped += 1
puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}"
else
result = BookmarkManager.new(user).create(post_id: post.id)
begin
manager = BookmarkManager.new(user)
bookmark = manager.create(post_id: post.id)
created += 1 if result.errors.none?
skipped += 1 if result.errors.any?
created += 1 if manager.errors.none?
skipped += 1 if manager.errors.any?
rescue
skipped += 1
end
end
end

View File

@ -57,6 +57,11 @@ module ImportScripts
UserCustomField.where(name: 'import_id', value: import_id.to_s).first.try(:user)
end
# Resolves the Discourse username that belongs to a user of the source system.
#
# @param import_id the user's ID in the source system
# @return [String, nil] the username, or nil when no Discourse user ID is known
#   for +import_id+ or no user row matches that ID
def find_username_by_import_id(import_id)
  discourse_user_id = user_id_from_imported_user_id(import_id)
  return nil unless discourse_user_id.present?

  User.where(id: discourse_user_id).pluck(:username).first
end
# Get the Discourse Category id based on the id of the source category
def category_id_from_imported_category_id(import_id)
@categories[import_id] || @categories[import_id.to_s]

View File

@ -22,13 +22,13 @@ module ImportScripts::PhpBB3
if version.start_with?('3.0')
require_relative 'database_3_0'
Database_3_0.new(@database_client, @database_settings)
elsif version.start_with?('3.1')
elsif version.start_with?('3.1') || version.start_with?('3.2')
require_relative 'database_3_1'
Database_3_1.new(@database_client, @database_settings)
else
raise UnsupportedVersionError, <<~MSG
Unsupported version (#{version}) of phpBB detected.
Currently only 3.0.x and 3.1.x are supported by this importer.
Currently only version 3.0, 3.1 and 3.2 are supported by this importer.
MSG
end
end

View File

@ -53,6 +53,20 @@ module ImportScripts::PhpBB3
SQL
end
# Fetches all groups from the phpBB groups table.
#
# @return the result of +query+; each row carries group_id, group_type,
#   group_name and group_desc
def fetch_groups
  sql = <<~SQL
    SELECT g.group_id, g.group_type, g.group_name, g.group_desc
    FROM #{@table_prefix}groups g
  SQL

  query(sql)
end
# Fetches all group memberships from the phpBB user_group table.
#
# @return the result of +query+; each row carries group_id, user_id and
#   group_leader
def fetch_group_users
  sql = <<~SQL
    SELECT ug.group_id, ug.user_id, ug.group_leader
    FROM #{@table_prefix}user_group ug
  SQL

  query(sql)
end
def fetch_categories
query(<<-SQL)
SELECT f.forum_id, f.parent_id, f.forum_name, f.forum_desc, x.first_post_time
@ -213,12 +227,20 @@ module ImportScripts::PhpBB3
SELECT b.user_id, t.topic_first_post_id
FROM #{@table_prefix}bookmarks b
JOIN #{@table_prefix}topics t ON (b.topic_id = t.topic_id)
WHERE b.user_id > #{last_user_id} AND b.topic_id > #{last_topic_id}
WHERE b.user_id > #{last_user_id}
ORDER BY b.user_id, b.topic_id
LIMIT #{@batch_size}
SQL
end
# Looks up a single smiley definition by its code.
#
# The code originates from imported post text, so backslashes and single
# quotes are escaped before it is placed inside the SQL string literal.
# Without that, a code containing a quote (for example :'( ) terminates the
# literal early and the query fails or matches the wrong rows.
#
# @param smiley_code [String] the smiley code as written in the post, e.g. ":)"
# @return the first element of the +query+ result (columns: emotion, smiley_url)
def get_smiley(smiley_code)
  # NOTE(review): backslash escaping assumes the MySQL server runs without
  # NO_BACKSLASH_ESCAPES (the default) -- confirm, or switch to the database
  # client's own escape helper if one is available here.
  escaped_code = smiley_code.to_s.gsub(/[\\']/) { |char| "\\#{char}" }

  query(<<~SQL).first
    SELECT emotion, smiley_url
    FROM #{@table_prefix}smilies
    WHERE code = '#{escaped_code}'
  SQL
end
def get_config_values
query(<<-SQL).first
SELECT

View File

@ -27,8 +27,13 @@ module ImportScripts::PhpBB3
def execute
puts '', "importing from phpBB #{@php_config[:phpbb_version]}"
SiteSetting.tagging_enabled = true if @settings.tag_mappings.present?
import_users
import_anonymous_users if @settings.import_anonymous_users
import_groups
import_user_groups
import_new_categories
import_categories
import_posts
import_private_messages if @settings.import_private_messages
@ -67,12 +72,12 @@ module ImportScripts::PhpBB3
batches do |offset|
rows, last_user_id = @database.fetch_users(last_user_id)
rows = rows.to_a.uniq { |row| row[:user_id] }
break if rows.size < 1
next if all_records_exist?(:users, importer.map_users_to_import_ids(rows))
create_users(rows, total: total_count, offset: offset) do |row|
begin
next if user_id_from_imported_user_id(@settings.prefix(row[:user_id]))
importer.map_user(row)
rescue => e
log_error("Failed to map user with ID #{row[:user_id]}", e)
@ -91,10 +96,9 @@ module ImportScripts::PhpBB3
rows, last_username = @database.fetch_anonymous_users(last_username)
break if rows.size < 1
next if all_records_exist?(:users, importer.map_anonymous_users_to_import_ids(rows))
create_users(rows, total: total_count, offset: offset) do |row|
begin
next if user_id_from_imported_user_id(@settings.prefix(row[:post_username]))
importer.map_anonymous_user(row)
rescue => e
log_error("Failed to map anonymous user with ID #{row[:user_id]}", e)
@ -103,12 +107,74 @@ module ImportScripts::PhpBB3
end
end
# Imports the phpBB groups returned by the database as Discourse groups.
#
# Each row is mapped to a hash (id, name, full_name, bio_raw) for
# +create_groups+. Rows with group_type 3 are skipped (presumably phpBB's
# built-in special groups -- confirm against the phpBB schema). Mapping errors
# are logged per row and do not abort the import.
def import_groups
  puts '', 'creating groups'
  group_rows = @database.fetch_groups

  create_groups(group_rows) do |row|
    begin
      next if row[:group_type] == 3

      # Prefix the name with the site name when several forums are imported,
      # then cut it to 20 characters and replace every disallowed character.
      site_name = @settings.site_name
      raw_name = site_name.present? ? "#{site_name}_#{row[:group_name]}" : row[:group_name]
      group_name = raw_name[0..19].gsub(/[^a-zA-Z0-9\-_. ]/, '_')

      # Fall back to the unprocessed description when the conversion fails.
      bio_raw =
        begin
          @importers.text_processor.process_raw_text(row[:group_desc])
        rescue
          row[:group_desc]
        end

      {
        id: @settings.prefix(row[:group_id]),
        name: group_name,
        full_name: row[:group_name],
        bio_raw: bio_raw
      }
    rescue => e
      log_error("Failed to map group with ID #{row[:group_id]}", e)
    end
  end
end
# Adds imported users to the imported groups they belonged to in phpBB.
#
# Every membership row is resolved to a Discourse group ID and user ID.
# Rows whose group or user has no Discourse counterpart are skipped, so no
# membership is ever looked up or created with a nil user ID. Errors raised
# while creating a membership are logged and the import continues.
def import_user_groups
  puts '', 'creating user groups'

  @database.fetch_group_users.each do |row|
    group_id = @lookup.group_id_from_imported_group_id(@settings.prefix(row[:group_id]))
    next unless group_id

    user_id = @lookup.user_id_from_imported_user_id(@settings.prefix(row[:user_id]))
    # The membership can reference a user that was not imported; without this
    # guard a nil user_id would be passed on to GroupUser.
    next unless user_id

    begin
      GroupUser.find_or_create_by(user_id: user_id, group_id: group_id, owner: row[:group_leader])
    rescue => e
      log_error("Failed to add user #{row[:user_id]} to group #{row[:group_id]}", e)
    end
  end
end
# Creates the additional categories listed under `new_categories` in the
# import settings.
#
# The setting is empty in the sample settings file, in which case the parsed
# value is nil; an empty list is passed on instead so that nothing is created.
# Each entry is mapped to a hash (id, name, parent_category_id) for
# +create_categories+.
def import_new_categories
  puts '', 'creating new categories'

  new_categories = @settings.new_categories || []

  create_categories(new_categories) do |row|
    next if row == "SKIP"

    {
      id: @settings.prefix(row[:forum_id]),
      name: row[:name],
      parent_category_id: @lookup.category_id_from_imported_category_id(@settings.prefix(row[:parent_id]))
    }
  end
end
# Imports the phpBB forums as Discourse categories.
#
# Forums whose entry in the category mappings is 'SKIP' are left out; all
# other rows are mapped by the category importer.
def import_categories
  puts '', 'creating categories'

  forum_rows = @database.fetch_categories
  category_importer = @importers.category_importer

  create_categories(forum_rows) do |row|
    mapping = @settings.category_mappings[row[:forum_id].to_s]
    category_importer.map_category(row) unless mapping == 'SKIP'
  end
end
@ -123,10 +189,9 @@ module ImportScripts::PhpBB3
rows, last_post_id = @database.fetch_posts(last_post_id)
break if rows.size < 1
next if all_records_exist?(:posts, importer.map_to_import_ids(rows))
create_posts(rows, total: total_count, offset: offset) do |row|
begin
next if post_id_from_imported_post_id(@settings.prefix(row[:post_id]))
importer.map_post(row)
rescue => e
log_error("Failed to map post with ID #{row[:post_id]}", e)
@ -145,10 +210,9 @@ module ImportScripts::PhpBB3
rows, last_msg_id = @database.fetch_messages(last_msg_id)
break if rows.size < 1
next if all_records_exist?(:posts, importer.map_to_import_ids(rows))
create_posts(rows, total: total_count, offset: offset) do |row|
begin
next if post_id_from_imported_post_id(@settings.prefix("pm:#{row[:msg_id]}"))
importer.map_message(row)
rescue => e
log_error("Failed to map message with ID #{row[:msg_id]}", e)
@ -168,7 +232,11 @@ module ImportScripts::PhpBB3
break if rows.size < 1
create_bookmarks(rows, total: total_count, offset: offset) do |row|
importer.map_bookmark(row)
begin
importer.map_bookmark(row)
rescue => e
log_error("Failed to map bookmark (#{row[:user_id]}, #{row[:topic_first_post_id]})", e)
end
end
end
end

View File

@ -2,10 +2,14 @@
module ImportScripts::PhpBB3
class BookmarkImporter
def initialize(settings)
@settings = settings
end
def map_bookmark(row)
{
user_id: row[:user_id],
post_id: row[:topic_first_post_id]
user_id: @settings.prefix(row[:user_id]),
post_id: @settings.prefix(row[:topic_first_post_id])
}
end
end

View File

@ -5,20 +5,28 @@ module ImportScripts::PhpBB3
# @param lookup [ImportScripts::LookupContainer]
# @param text_processor [ImportScripts::PhpBB3::TextProcessor]
# @param permalink_importer [ImportScripts::PhpBB3::PermalinkImporter]
def initialize(lookup, text_processor, permalink_importer)
# @param settings [ImportScripts::PhpBB3::Settings]
def initialize(lookup, text_processor, permalink_importer, settings)
@lookup = lookup
@text_processor = text_processor
@permalink_importer = permalink_importer
@settings = settings
end
def map_category(row)
return if @settings.category_mappings[row[:forum_id].to_s]
if row[:parent_id] && @settings.category_mappings[row[:parent_id].to_s]
puts "parent category (#{row[:parent_id]}) was mapped, but children was not (#{row[:forum_id]})"
end
{
id: row[:forum_id],
id: @settings.prefix(row[:forum_id]),
name: CGI.unescapeHTML(row[:forum_name]),
parent_category_id: @lookup.category_id_from_imported_category_id(row[:parent_id]),
parent_category_id: @lookup.category_id_from_imported_category_id(@settings.prefix(row[:parent_id])),
post_create_action: proc do |category|
update_category_description(category, row)
@permalink_importer.create_for_category(category, row[:forum_id])
@permalink_importer.create_for_category(category, row[:forum_id]) # skip @settings.prefix because ID is used in permalink generation
end
}
end
@ -43,7 +51,7 @@ module ImportScripts::PhpBB3
end
if row[:forum_desc].present?
changes = { raw: @text_processor.process_raw_text(row[:forum_desc]) }
changes = { raw: (@text_processor.process_raw_text(row[:forum_desc]) rescue row[:forum_desc]) }
opts = { revised_at: post.created_at, bypass_bump: true }
post.revise(Discourse.system_user, changes, opts)
end

View File

@ -32,7 +32,7 @@ module ImportScripts::PhpBB3
end
def category_importer
CategoryImporter.new(@lookup, text_processor, permalink_importer)
CategoryImporter.new(@lookup, text_processor, permalink_importer, @settings)
end
def post_importer
@ -44,15 +44,13 @@ module ImportScripts::PhpBB3
end
def bookmark_importer
BookmarkImporter.new
BookmarkImporter.new(@settings)
end
def permalink_importer
@permalink_importer ||= PermalinkImporter.new(@settings.permalinks)
end
protected
def attachment_importer
AttachmentImporter.new(@database, @uploader, @settings, @phpbb_config)
end
@ -62,15 +60,15 @@ module ImportScripts::PhpBB3
end
def poll_importer
PollImporter.new(@lookup, @database, text_processor)
PollImporter.new(@lookup, @database, text_processor, @settings)
end
def text_processor
@text_processor ||= TextProcessor.new(@lookup, @database, smiley_processor, @settings)
@text_processor ||= TextProcessor.new(@lookup, @database, smiley_processor, @settings, @phpbb_config)
end
def smiley_processor
SmileyProcessor.new(@uploader, @settings, @phpbb_config)
SmileyProcessor.new(@uploader, @database, @settings, @phpbb_config)
end
end
end

View File

@ -20,7 +20,7 @@ module ImportScripts::PhpBB3
end
def map_message(row)
user_id = @lookup.user_id_from_imported_user_id(row[:author_id]) || Discourse.system_user.id
user_id = @lookup.user_id_from_imported_user_id(@settings.prefix(row[:author_id])) || Discourse.system_user.id
attachments = import_attachments(row, user_id)
mapped = {
@ -84,7 +84,7 @@ module ImportScripts::PhpBB3
import_user_ids = get_recipient_user_ids(row[:to_address])
import_user_ids.map! do |import_user_id|
@lookup.find_user_by_import_id(import_user_id).try(:username)
@lookup.find_user_by_import_id(@settings.prefix(import_user_id)).try(:username)
end.compact
end
@ -93,7 +93,7 @@ module ImportScripts::PhpBB3
end
def get_import_id(msg_id)
"pm:#{msg_id}"
@settings.prefix("pm:#{msg_id}")
end
# Creates a sorted array consisting of the message's author and recipients.

View File

@ -39,7 +39,7 @@ module ImportScripts::PhpBB3
end
def create_for_post(post, import_id)
return unless @settings.create_topic_links && post
return unless @settings.create_post_links && post
url = "viewtopic.php?p=#{import_id}"

View File

@ -5,10 +5,12 @@ module ImportScripts::PhpBB3
# @param lookup [ImportScripts::LookupContainer]
# @param database [ImportScripts::PhpBB3::Database_3_0 | ImportScripts::PhpBB3::Database_3_1]
# @param text_processor [ImportScripts::PhpBB3::TextProcessor]
def initialize(lookup, database, text_processor)
# @param settings [ImportScripts::PhpBB3::Settings]
def initialize(lookup, database, text_processor, settings)
@lookup = lookup
@database = database
@text_processor = text_processor
@settings = settings
end
# @param poll_data [ImportScripts::PhpBB3::PollData]
@ -47,7 +49,7 @@ module ImportScripts::PhpBB3
end
def get_option_text(row)
text = @text_processor.process_raw_text(row[:poll_option_text])
text = @text_processor.process_raw_text(row[:poll_option_text]) rescue row[:poll_option_text]
text.squish!
text.gsub!(/^(\d+)\./, '\1\.')
text
@ -55,7 +57,7 @@ module ImportScripts::PhpBB3
# @param poll_data [ImportScripts::PhpBB3::PollData]
def get_poll_text(poll_data)
title = @text_processor.process_raw_text(poll_data.title)
title = @text_processor.process_raw_text(poll_data.title) rescue poll_data.title
text = +"#{title}\n\n"
arguments = ["results=always"]
@ -118,7 +120,7 @@ module ImportScripts::PhpBB3
rows.each do |row|
option_id = mapped_option_ids[row[:poll_option_id]]
user_id = @lookup.user_id_from_imported_user_id(row[:user_id])
user_id = @lookup.user_id_from_imported_user_id(@settings.prefix(row[:user_id]))
if option_id.present? && user_id.present?
PollVote.create!(poll: poll, poll_option_id: option_id, user_id: user_id)

View File

@ -18,22 +18,24 @@ module ImportScripts::PhpBB3
end
def map_to_import_ids(rows)
rows.map { |row| row[:post_id] }
rows.map { |row| @settings.prefix(row[:post_id]) }
end
def map_post(row)
imported_user_id = row[:post_username].blank? ? row[:poster_id] : row[:post_username]
return if @settings.category_mappings[row[:forum_id].to_s] == 'SKIP'
imported_user_id = @settings.prefix(row[:post_username].blank? ? row[:poster_id] : row[:post_username])
user_id = @lookup.user_id_from_imported_user_id(imported_user_id) || -1
is_first_post = row[:post_id] == row[:topic_first_post_id]
attachments = import_attachments(row, user_id)
mapped = {
id: row[:post_id],
id: @settings.prefix(row[:post_id]),
user_id: user_id,
created_at: Time.zone.at(row[:post_time]),
raw: @text_processor.process_post(row[:post_text], attachments),
import_topic_id: row[:topic_id]
import_topic_id: @settings.prefix(row[:topic_id])
}
if is_first_post
@ -54,14 +56,18 @@ module ImportScripts::PhpBB3
def map_first_post(row, mapped)
poll_data = add_poll(row, mapped) if @settings.import_polls
mapped[:category] = @lookup.category_id_from_imported_category_id(row[:forum_id])
mapped[:category] = @lookup.category_id_from_imported_category_id(@settings.prefix(@settings.category_mappings[row[:forum_id].to_s])) ||
@lookup.category_id_from_imported_category_id(@settings.prefix(row[:forum_id]))
mapped[:title] = CGI.unescapeHTML(row[:topic_title]).strip[0...255]
mapped[:pinned_at] = mapped[:created_at] unless row[:topic_type] == Constants::POST_NORMAL
mapped[:pinned_globally] = row[:topic_type] == Constants::POST_GLOBAL
mapped[:views] = row[:topic_views]
mapped[:post_create_action] = proc do |post|
@permalink_importer.create_for_topic(post.topic, row[:topic_id])
@permalink_importer.create_for_post(post, row[:post_id])
if tags = @settings.tag_mappings[row[:forum_id].to_s].presence
DiscourseTagging.tag_topic_by_names(post.topic, staff_guardian, tags)
end
@permalink_importer.create_for_topic(post.topic, row[:topic_id]) # skip @settings.prefix because ID is used in permalink generation
@permalink_importer.create_for_post(post, row[:post_id]) # skip @settings.prefix because ID is used in permalink generation
@poll_importer.update_poll(row[:topic_id], post, poll_data) if poll_data
TopicViewItem.add(post.topic_id, row[:poster_ip], post.user_id, post.created_at, true)
end
@ -70,16 +76,16 @@ module ImportScripts::PhpBB3
end
def map_other_post(row, mapped)
parent = @lookup.topic_lookup_from_imported_post_id(row[:topic_first_post_id])
parent = @lookup.topic_lookup_from_imported_post_id(@settings.prefix(row[:topic_first_post_id]))
if parent.blank?
puts "Parent post #{row[:topic_first_post_id]} doesn't exist. Skipping #{row[:post_id]}: #{row[:topic_title][0..40]}"
puts "Parent post #{@settings.prefix(row[:topic_first_post_id])} doesn't exist. Skipping #{@settings.prefix(row[:post_id])}: #{row[:topic_title][0..40]}"
return nil
end
mapped[:topic_id] = parent[:topic_id]
mapped[:post_create_action] = proc do |post|
@permalink_importer.create_for_post(post, row[:post_id])
@permalink_importer.create_for_post(post, row[:post_id]) # skip @settings.prefix because ID is used in permalink generation
TopicViewItem.add(post.topic_id, row[:poster_ip], post.user_id, post.created_at, true)
end
@ -91,9 +97,14 @@ module ImportScripts::PhpBB3
poll_data = PollData.new(row[:poll_title], row[:poll_max_options], row[:poll_end])
poll_raw = @poll_importer.create_raw(row[:topic_id], poll_data)
return if poll_data.options.size < 2
mapped_post[:raw] = poll_raw << "\n\n" << mapped_post[:raw]
poll_data
end
# Returns a guardian for the system user, built on first use and reused afterwards.
def staff_guardian
  return @_staff_guardian if @_staff_guardian

  @_staff_guardian = Guardian.new(Discourse.system_user)
end
end
end

View File

@ -12,14 +12,18 @@ module ImportScripts::PhpBB3
end
def map_users_to_import_ids(rows)
rows.map { |row| row[:user_id] }
rows.map { |row| @settings.prefix(row[:user_id]) }
end
def map_user(row)
is_active_user = row[:user_inactive_reason] != Constants::INACTIVE_REGISTER
trust_level = row[:user_posts] == 0 ? TrustLevel[0] : TrustLevel[1]
trust_level = @settings.trust_level_for_posts(row[:user_posts], trust_level: trust_level)
manual_locked_trust_level = trust_level > TrustLevel[1] ? trust_level : nil
{
id: row[:user_id],
id: @settings.prefix(row[:user_id]),
email: row[:user_email],
username: row[:username],
password: @settings.import_passwords ? row[:user_password] : nil,
@ -28,7 +32,8 @@ module ImportScripts::PhpBB3
last_seen_at: row[:user_lastvisit] == 0 ? Time.zone.at(row[:user_regdate]) : Time.zone.at(row[:user_lastvisit]),
registration_ip_address: (IPAddr.new(row[:user_ip]) rescue nil),
active: is_active_user,
trust_level: row[:user_posts] == 0 ? TrustLevel[0] : TrustLevel[1],
trust_level: trust_level,
manual_locked_trust_level: manual_locked_trust_level,
approved: is_active_user,
approved_by_id: is_active_user ? Discourse.system_user.id : nil,
approved_at: is_active_user ? Time.now : nil,
@ -45,14 +50,14 @@ module ImportScripts::PhpBB3
end
def map_anonymous_users_to_import_ids(rows)
rows.map { |row| row[:post_username] }
rows.map { |row| @settings.prefix(row[:post_username]) }
end
def map_anonymous_user(row)
username = row[:post_username]
{
id: username,
id: @settings.prefix(username),
email: "anonymous_#{SecureRandom.hex}@no-email.invalid",
username: username,
name: @settings.username_as_name ? username : '',

View File

@ -11,6 +11,73 @@ database:
batch_size: 1000 # Don't change this unless you know what you're doing. The default (1000) should work just fine.
import:
# Set this if you import multiple phpBB forums into a single Discourse forum.
#
# For example, when importing multiple sites, prefix all imported IDs
# with 'first' to avoid conflicts. Subsequent import runs must have a
# different 'site_name'.
#
# site_name: first
#
site_name:
# Create new categories
#
# For example, to create a parent category and a subcategory.
#
# new_categories:
# - forum_id: foo
# name: Foo Category
# - forum_id: bar
# name: Bar Category
# parent_id: foo
#
new_categories:
# Category mappings
#
# For example, topics from phpBB category 1 and 2 will be imported
# in the new "Foo Category" category, topics from phpBB category 3
# will be imported in subcategory "Bar category", topics from phpBB
# category 4 will be merged into category 5 and category 6 will be
# skipped.
#
# category_mappings:
# 1: foo
# 2: foo
# 3: bar
# 4: 5
# 6: SKIP
#
category_mappings:
# Tag mappings
#
# For example, imported topics from phpBB category 1 will be tagged
# with 'first-category', etc.
#
# tag_mappings:
# 1:
# - first-category
# 2:
# - second-category
# 3:
# - third-category
#
tag_mappings:
# Rank to trust level mapping
#
# Map phpBB 3.x post counts (the thresholds used for ranks) to Discourse trust levels.
# Users with at least 3000 posts will get TL3, etc.
#
# rank_mapping:
# trust_level_1: 200
# trust_level_2: 1000
# trust_level_3: 3000
#
rank_mapping:
# WARNING: Do not activate this option unless you know what you are doing.
# It will probably break the BBCode to Markdown conversion and slow down your import.
use_bbcode_to_md: false

View File

@ -0,0 +1,88 @@
# frozen_string_literal: true
module ImportScripts; end
module ImportScripts::PhpBB3; end

module ImportScripts::PhpBB3::BBCode
  LINEBREAK_AUTO = :auto
  LINEBREAK_HARD = :hard
  LINEBREAK_HTML = :html

  # A node of the intermediate tree that is built from the XML of a post and
  # later rendered to Markdown. Every node knows its parent, its children and
  # its direct siblings, and carries the text fragments and line break
  # requirements used when rendering.
  class MarkdownNode
    # @return [String] name of the XML node this node was created for
    attr_reader :xml_node_name

    # @return [MarkdownNode, nil] nil for the root node
    attr_reader :parent

    # @return [Array<MarkdownNode>, nil] nil after {#skip_children} was called
    attr_reader :children

    # @return [MarkdownNode, nil]
    attr_accessor :previous_sibling

    # @return [MarkdownNode, nil]
    attr_accessor :next_sibling

    # @return [String]
    attr_accessor :text

    # @return [String]
    attr_accessor :prefix

    # @return [String]
    attr_accessor :postfix

    # @return [Integer]
    attr_accessor :prefix_linebreaks

    # @return [Integer]
    attr_accessor :postfix_linebreaks

    # @return [Symbol] one of LINEBREAK_AUTO, LINEBREAK_HARD, LINEBREAK_HTML
    attr_accessor :prefix_linebreak_type

    # @return [Symbol] one of LINEBREAK_AUTO, LINEBREAK_HARD, LINEBREAK_HTML
    attr_accessor :postfix_linebreak_type

    # @return [String, nil] nil unless explicitly assigned
    attr_accessor :prefix_children

    # Creates the node and, when a parent is given, appends it to the parent's
    # children and links it with the previously last child.
    #
    # @param xml_node_name [String]
    # @param parent [MarkdownNode, nil]
    def initialize(xml_node_name:, parent:)
      @xml_node_name = xml_node_name

      @text = +""
      @prefix = +""
      @postfix = +""
      @prefix_children = nil

      @prefix_linebreaks = 0
      @postfix_linebreaks = 0

      @prefix_linebreak_type = LINEBREAK_AUTO
      @postfix_linebreak_type = LINEBREAK_AUTO

      @parent = parent
      @children = []

      if @parent
        @previous_sibling = @parent.children.last
        @previous_sibling.next_sibling = self if @previous_sibling
        @parent.children << self
      end
    end

    # Uses the same text as prefix and postfix, e.g. "**" for bold text.
    #
    # @param text [String]
    def enclosed_with=(text)
      @prefix = @postfix = text
    end

    # Marks the node as having no renderable children; {#children} returns nil
    # afterwards.
    def skip_children
      @children = nil
    end

    # @return [String] short description of the node for debugging; a node
    #   whose children were skipped reports zero children
    def to_s
      child_count = children ? children.size : 0
      "name: #{xml_node_name}, prefix: #{prefix}, text: #{text}, children: #{child_count}, postfix: #{postfix}"
    end
  end
end

View File

@ -0,0 +1,356 @@
# frozen_string_literal: true
require 'nokogiri'
require_relative 'markdown_node'
module ImportScripts::PhpBB3::BBCode
class XmlToMarkdown
# @param xml [String] XML document to convert; it is parsed with Nokogiri
# @param opts [Hash] optional settings and callbacks:
#   :username_from_user_id, :smilie_to_emoji, :quoted_post_from_post_id,
#   :upload_md_from_file and :url_replacement are callables (default nil);
#   :allow_inline_code and :traditional_linebreaks are flags (default false)
def initialize(xml, opts = {})
  @username_from_user_id, @smilie_to_emoji, @quoted_post_from_post_id, @upload_md_from_file, @url_replacement =
    opts.values_at(
      :username_from_user_id,
      :smilie_to_emoji,
      :quoted_post_from_post_id,
      :upload_md_from_file,
      :url_replacement
    )

  @allow_inline_code = opts.fetch(:allow_inline_code, false)
  @traditional_linebreaks = opts.fetch(:traditional_linebreaks, false)

  @list_stack = []
  @doc = Nokogiri::XML(xml)
end
# Converts the parsed XML document to Markdown.
#
# @return [String] the Markdown text without trailing whitespace
def convert
  preprocess_xml

  root_node = MarkdownNode.new(xml_node_name: "ROOT", parent: nil)
  visit(@doc.root, root_node)

  markdown = to_markdown(root_node)
  markdown.rstrip
end
private
IGNORED_ELEMENTS = ["s", "e", "i"]
ELEMENTS_WITHOUT_LEADING_WHITESPACES = ["LIST", "LI"]
ELEMENTS_WITH_HARD_LINEBREAKS = ["B", "I", "U"]
EXPLICIT_LINEBREAK_THRESHOLD = 2
# Cleans up the parsed document in place before it is converted:
# text nodes lose leading newlines (and the whitespace following them),
# text nodes that end up empty are removed, and elements listed in
# IGNORED_ELEMENTS are removed together with their content.
def preprocess_xml
@doc.traverse do |node|
if node.is_a? Nokogiri::XML::Text
# Drop newlines at the very beginning of the text plus any whitespace after them.
node.content = node.content.gsub(/\A\n+\s*/, "")
# Remaining leading whitespace is only removed for the first child of a list or list item.
node.content = node.content.lstrip if remove_leading_whitespaces?(node)
node.remove if node.content.empty?
elsif IGNORED_ELEMENTS.include?(node.name)
# NOTE(review): "s", "e" and "i" are presumably markup-only helper elements of the
# stored post XML -- confirm against the phpBB text format.
node.remove
end
end
end
# Tells whether leading whitespace of +xml_node+ should be stripped.
#
# @return [Boolean] true only when the node is the first child of an element
#   listed in ELEMENTS_WITHOUT_LEADING_WHITESPACES
def remove_leading_whitespaces?(xml_node)
  container = xml_node.parent
  return false unless container
  return false unless ELEMENTS_WITHOUT_LEADING_WHITESPACES.include?(container.name)

  container.children.first == xml_node
end
# Visits +xml_node+ and, recursively, its children.
#
# When a method named visit_<node name> exists, a Markdown node is created
# below +md_parent+ and handed to that method. After all children have been
# visited, an optional after_<node name> hook is called.
#
# Nothing is done for nodes below a parent whose children were skipped
# (children is nil). This also keeps the after-hooks from running for such
# nodes: previously after_LIST would pop a list level even though the
# matching visit_LIST had never pushed one.
#
# @param xml_node [Nokogiri::XML::Node]
# @param md_parent [MarkdownNode]
def visit(xml_node, md_parent)
  return if md_parent.children.nil?

  visitor = "visit_#{xml_node.name}"

  # The second argument of respond_to? includes private methods in the check.
  if respond_to?(visitor, true)
    md_node = create_node(xml_node, md_parent)
    send(visitor, xml_node, md_node)
  end

  xml_node.children.each { |xml_child| visit(xml_child, md_node || md_parent) }

  after_hook = "after_#{xml_node.name}"
  send(after_hook, xml_node, md_node) if respond_to?(after_hook, true)
end
# Returns the Markdown node to use for +xml_node+.
#
# Consecutive <br> elements share one node: when the last child of
# +md_parent+ already is a "br" node, that node is returned instead of
# creating a new one.
def create_node(xml_node, md_parent)
  if xml_node.name == "br"
    previous = md_parent.children.last
    return previous if previous && previous.xml_node_name == "br"
  end

  MarkdownNode.new(xml_node_name: xml_node.name, parent: md_parent)
end
# Appends the text of a text node to the Markdown node.
def visit_text(xml_node, md_node)
  content = text(xml_node)
  md_node.text << content
end

# Bold text; a <B> directly inside another <B> adds no extra markers.
def visit_B(xml_node, md_node)
  md_node.enclosed_with = "**" unless xml_node.parent&.name == 'B'
end

# Italic text; a <I> directly inside another <I> adds no extra markers.
def visit_I(xml_node, md_node)
  md_node.enclosed_with = "_" unless xml_node.parent&.name == 'I'
end

# Underlined text is kept as [u] BBCode; a <U> directly inside another <U>
# adds no extra tags.
def visit_U(xml_node, md_node)
  return if xml_node.parent&.name == 'U'

  md_node.prefix = "[u]"
  md_node.postfix = "[/u]"
end
# Code is rendered as inline code only when inline code is allowed and the
# content has no line break; otherwise as a fenced "text" block. The raw
# content becomes the node's text and child nodes are not rendered.
def visit_CODE(xml_node, md_node)
  code = xml_node.content
  inline = @allow_inline_code && !code.include?("\n")

  if inline
    md_node.enclosed_with = "`"
  else
    md_node.prefix = "```text\n"
    md_node.postfix = "\n```"
  end

  md_node.text = code.rstrip
  md_node.skip_children
  md_node.prefix_linebreaks = md_node.postfix_linebreaks = 2
  md_node.prefix_linebreak_type = LINEBREAK_HTML
end

# Opens a list level. A top-level list is separated by two line breaks,
# a nested list by one. A list without "type" attribute is unordered.
def visit_LIST(xml_node, md_node)
  top_level = @list_stack.empty?

  md_node.prefix_linebreaks = md_node.postfix_linebreaks = top_level ? 2 : 1
  md_node.prefix_linebreak_type = LINEBREAK_HTML if top_level

  @list_stack.push(unordered: xml_node.attribute('type').nil?, item_count: 0)
end

# Closes the list level opened by visit_LIST.
def after_LIST(xml_node, md_node)
  @list_stack.pop
end
# Renders a list item: "*" for unordered lists, a running number for ordered
# lists, indented by two spaces per nesting level.
#
# A list item that is not inside any list (empty list stack) is rendered as a
# top-level bullet instead of raising an error on the missing list state.
def visit_LI(xml_node, md_node)
  list = @list_stack.last

  if list
    list[:item_count] += 1
    depth = @list_stack.size - 1
    symbol = list[:unordered] ? '*' : "#{list[:item_count]}."
  else
    depth = 0
    symbol = '*'
  end

  indentation = ' ' * 2 * depth
  md_node.prefix = "#{indentation}#{symbol} "
  md_node.postfix_linebreaks = 1
end
# Renders an image as Markdown image without alt text, separated from the
# surrounding content by two line breaks. Child nodes are not rendered.
def visit_IMG(xml_node, md_node)
  source = xml_node.attribute('src')

  md_node.text = +"![](#{source})"
  md_node.prefix_linebreaks = md_node.postfix_linebreaks = 2
  md_node.skip_children
end

# Renders a link. When the link text equals the (still HTML-escaped) URL, only
# the URL is emitted; otherwise a Markdown link wraps the child content. The
# URL is HTML-unescaped and passed through the optional replacement callback.
def visit_URL(xml_node, md_node)
  raw_url = xml_node.attribute('url').to_s
  target = CGI.unescapeHTML(raw_url)
  target = @url_replacement.call(target) if @url_replacement

  if xml_node.content.strip != raw_url
    md_node.prefix = "["
    md_node.postfix = "](#{target})"
  else
    md_node.text = target
    md_node.skip_children
  end
end
# Wraps an e-mail address in angle brackets.
def visit_EMAIL(xml_node, md_node)
  md_node.prefix = "<"
  md_node.postfix = ">"
end

# Counts a line break on the (possibly shared) "br" node. From the second
# consecutive break on, breaks inside elements listed in
# ELEMENTS_WITH_HARD_LINEBREAKS are rendered as hard line breaks.
def visit_br(xml_node, md_node)
  md_node.postfix_linebreaks += 1

  return unless md_node.postfix_linebreaks > 1
  return unless ELEMENTS_WITH_HARD_LINEBREAKS.include?(xml_node.parent&.name)

  md_node.postfix_linebreak_type = LINEBREAK_HARD
end

# Replaces a smilie by the result of the smilie callback. Without a callback
# the node is left untouched and its children are rendered as usual.
def visit_E(xml_node, md_node)
  return unless @smilie_to_emoji

  md_node.text = @smilie_to_emoji.call(xml_node.content)
  md_node.skip_children
end
def visit_QUOTE(xml_node, md_node)
if post = quoted_post(xml_node)
md_node.prefix = %Q{[quote="#{post[:username]}, post:#{post[:post_number]}, topic:#{post[:topic_id]}"]\n}
md_node.postfix = "\n[/quote]"
elsif username = quoted_username(xml_node)
md_node.prefix = %Q{[quote="#{username}"]\n}
md_node.postfix = "\n[/quote]"
else
md_node.prefix_children = "> "
end
md_node.prefix_linebreaks = md_node.postfix_linebreaks = 2
md_node.prefix_linebreak_type = LINEBREAK_HTML
end
def quoted_post(xml_node)
if @quoted_post_from_post_id
post_id = to_i(xml_node.attr("post_id"))
@quoted_post_from_post_id.call(post_id) if post_id
end
end
def quoted_username(xml_node)
if @username_from_user_id
user_id = to_i(xml_node.attr("user_id"))
username = @username_from_user_id.call(user_id) if user_id
end
username = xml_node.attr("author") unless username
username
end
def to_i(string)
string.to_i if string&.match(/\A\d+\z/)
end
def visit_ATTACHMENT(xml_node, md_node)
filename = xml_node.attr("filename")
index = to_i(xml_node.attr("index"))
md_node.text = @upload_md_from_file.call(filename, index) if @upload_md_from_file
md_node.prefix_linebreaks = md_node.postfix_linebreaks = 1
md_node.skip_children
end
# Maps a font size to HTML: sizes 1-99 become <small>, sizes 101-200 become
# <big>. Any other or non-numeric size leaves the node unchanged.
def visit_SIZE(xml_node, md_node)
  size = to_i(xml_node.attr("size"))
  return if size.nil?

  case size
  when 1..99
    md_node.prefix = '<small>'
    md_node.postfix = '</small>'
  when 101..200
    md_node.prefix = '<big>'
    md_node.postfix = '</big>'
  end
end

# @param escape_markdown [Boolean] currently without effect; Markdown
#   characters are not escaped
# @return [String] the HTML-unescaped text of +xml_node+
def text(xml_node, escape_markdown: true)
  CGI.unescapeHTML(xml_node.text)
end
# Renders all children of +md_parent+ (recursively) into one Markdown string.
#
# @param md_parent [MarkdownNode]
# @return [String] the concatenated prefix, text and postfix of every child,
#   with the requested line breaks inserted before and after each child
def to_markdown(md_parent)
markdown = +""
md_parent.children.each do |md_node|
prefix = md_node.prefix
# A node with children is rendered from them; a leaf, or a node whose
# children were skipped (nil), contributes its own text.
text = md_node.children&.any? ? to_markdown(md_node) : md_node.text
postfix = md_node.postfix
# Prefix inherited from the nearest ancestor that defines prefix_children, if any.
parent_prefix = prefix_from_parent(md_parent)
# The inherited prefix is not put in front of "br" nodes. It is added when
# md_parent itself defines prefix_children or when output already exists
# at this level.
if parent_prefix && md_node.xml_node_name != "br" && (md_parent.prefix_children || !markdown.empty?)
prefix = "#{parent_prefix}#{prefix}"
end
# Whitespace inside code is left exactly as it is.
if md_node.xml_node_name != "CODE"
text, prefix, postfix = hoist_whitespaces!(markdown, text, prefix, postfix)
end
add_linebreaks!(markdown, md_node.prefix_linebreaks, md_node.prefix_linebreak_type, parent_prefix)
markdown << prefix
markdown << text
markdown << postfix
add_linebreaks!(markdown, md_node.postfix_linebreaks, md_node.postfix_linebreak_type, parent_prefix)
end
markdown
end
# Moves whitespace at the edges of +text+ outside of the surrounding prefix
# and postfix, so that markers such as "**" sit directly next to the text.
#
# @param markdown [String] the output generated so far (only inspected)
# @param text [String]
# @param prefix [String]
# @param postfix [String]
# @return [Array<String>] the adjusted text, prefix and postfix
def hoist_whitespaces!(markdown, text, prefix, postfix)
  # No leading whitespace is needed at the start of a line.
  text = text.lstrip if markdown.end_with?("\n")

  if !prefix.empty?
    needs_leading_space = starts_with_whitespace?(text) && !ends_with_whitespace?(markdown)
    prefix = "#{text[0]}#{prefix}" if needs_leading_space
    text = text.lstrip
  end

  if !postfix.empty?
    postfix = "#{postfix}#{text[-1]}" if ends_with_whitespace?(text)
    text = text.rstrip
  end

  [text, prefix, postfix]
end
# Walks up the tree, starting at +md_parent+, and returns the first
# prefix_children that is set.
#
# @return [String, nil] nil when neither the node nor any ancestor defines one
def prefix_from_parent(md_parent)
  node = md_parent

  while node
    inherited = node.prefix_children
    return inherited if inherited

    node = node.parent
  end

  nil
end
# Makes sure that +markdown+ ends with at least the required number of line
# breaks. The trailing line breaks are rewritten in place; nothing is added to
# an empty string or when enough line breaks already exist.
#
# @param markdown [String] output generated so far; modified in place
# @param required_linebreak_count [Integer]
# @param linebreak_type [Symbol] LINEBREAK_AUTO, LINEBREAK_HARD or LINEBREAK_HTML
# @param prefix [String, nil] text repeated in front of every line break except the first
def add_linebreaks!(markdown, required_linebreak_count, linebreak_type, prefix = nil)
return if required_linebreak_count == 0 || markdown.empty?
# Count the line breaks at the end of the output, in any of the forms "\n", "\\\n" and "<br>\n".
existing_linebreak_count = markdown[/(?:\\?\n|<br>\n)*\z/].count("\n")
if linebreak_type == LINEBREAK_HTML
# One more line break than the larger of "existing" and "required - 1";
# it only replaces the requirement when it exceeds the threshold.
max_linebreak_count = [existing_linebreak_count, required_linebreak_count - 1].max + 1
required_linebreak_count = max_linebreak_count if max_linebreak_count > EXPLICIT_LINEBREAK_THRESHOLD
end
return if existing_linebreak_count >= required_linebreak_count
# Remove the existing trailing whitespace and line breaks, then write all of them again.
rstrip!(markdown)
# Index from which on linebreak() uses the alternative (explicit) line break forms.
alternative_linebreak_start_index = required_linebreak_count > EXPLICIT_LINEBREAK_THRESHOLD ? 1 : 2
required_linebreak_count.times do |index|
linebreak = linebreak(linebreak_type, index, alternative_linebreak_start_index, required_linebreak_count)
# The prefix goes in front of every line break except the first; in front of a
# plain "\n" its trailing whitespace is dropped.
markdown << (linebreak == "\n" ? prefix.rstrip : prefix) if prefix && index > 0
markdown << linebreak
end
end
# Removes trailing whitespace and trailing line breaks in any of the forms
# "\n", "\\\n" and "<br>\n" from +markdown+ (in place).
def rstrip!(markdown)
  trailing_breaks = /\s*(?:\\?\n|<br>\n)*\z/
  markdown.gsub!(trailing_breaks, '')
end

# Chooses the text of one line break out of a run of line breaks.
#
# @param linebreak_type [Symbol]
# @param linebreak_index [Integer] zero-based position within the run
# @param alternative_linebreak_start_index [Integer] first index that uses an
#   alternative line break
# @param required_linebreak_count [Integer] length of the run
# @return [String] "<br>\n", "\\\n" or "\n"
def linebreak(linebreak_type, linebreak_index, alternative_linebreak_start_index, required_linebreak_count)
  alternative = linebreak_index >= alternative_linebreak_start_index
  last = linebreak_index + 1 == required_linebreak_count

  if linebreak_type == LINEBREAK_HTML && alternative && last
    "<br>\n"
  elsif linebreak_type == LINEBREAK_HARD || @traditional_linebreaks || alternative
    "\\\n"
  else
    "\n"
  end
end

# @return [Boolean] true when +text+ begins with a whitespace character
def starts_with_whitespace?(text)
  text.start_with?(/\s/)
end

# @return [Boolean] true when +text+ ends with a whitespace character
def ends_with_whitespace?(text)
  leading_free = /\s\z/
  text.match?(leading_free)
end
end
end

View File

@ -1,14 +1,23 @@
# frozen_string_literal: true
require 'csv'
require 'yaml'
require_relative '../../base'
module ImportScripts::PhpBB3
class Settings
def self.load(filename)
yaml = YAML::load_file(filename)
Settings.new(yaml)
Settings.new(yaml.deep_stringify_keys.with_indifferent_access)
end
attr_reader :site_name
attr_reader :new_categories
attr_reader :category_mappings
attr_reader :tag_mappings
attr_reader :rank_mapping
attr_reader :import_anonymous_users
attr_reader :import_attachments
attr_reader :import_private_messages
@ -34,6 +43,14 @@ module ImportScripts::PhpBB3
def initialize(yaml)
import_settings = yaml['import']
@site_name = import_settings['site_name']
@new_categories = import_settings['new_categories']
@category_mappings = import_settings['category_mappings']
@tag_mappings = import_settings['tag_mappings']
@rank_mapping = import_settings['rank_mapping']
@import_anonymous_users = import_settings['anonymous_users']
@import_attachments = import_settings['attachments']
@import_private_messages = import_settings['private_messages']
@ -58,6 +75,20 @@ module ImportScripts::PhpBB3
@database = DatabaseSettings.new(yaml['database'])
end
# Namespaces +val+ with the configured site name.
#
# @param val value to prefix
# @return [String, Object] "<site_name>:<val>" when both the site name and
#   +val+ are present; otherwise +val+ unchanged
def prefix(val)
  return val unless @site_name.present? && val.present?

  "#{@site_name}:#{val}"
end
# Derives a trust level from +rank+ using the configured rank mapping.
#
# Mapping keys have the form "trust_level_<n>" and each value is the
# threshold +rank+ must reach for level <n> to apply. The highest applicable
# level wins, but the result is never lower than the +trust_level+ passed in.
#
# @param rank [Numeric] value compared against the mapping thresholds
# @param trust_level [Integer] minimum level to return
# @return [Integer]
def trust_level_for_posts(rank, trust_level: 0)
  return trust_level unless @rank_mapping.present?

  @rank_mapping.reduce(trust_level) do |level, (name, threshold)|
    next level unless rank >= threshold

    [level, name.gsub('trust_level_', '').to_i].max
  end
end
end
class DatabaseSettings

View File

@ -3,10 +3,12 @@
module ImportScripts::PhpBB3
class SmileyProcessor
# @param uploader [ImportScripts::Uploader]
# @param database [ImportScripts::PhpBB3::Database_3_0 | ImportScripts::PhpBB3::Database_3_1]
# @param settings [ImportScripts::PhpBB3::Settings]
# @param phpbb_config [Hash]
def initialize(uploader, settings, phpbb_config)
def initialize(uploader, database, settings, phpbb_config)
@uploader = uploader
@database = database
@smilies_path = File.join(settings.base_dir, phpbb_config[:smilies_path])
@smiley_map = {}
@ -16,12 +18,16 @@ module ImportScripts::PhpBB3
# Replaces phpBB's HTML smiley markup inside +text+ (in place) with the
# result of #emoji for the smiley code.
#
# @param text [String] mutable post text
# @return [String, nil] +text+, or nil when no smiley markup was found
def replace_smilies(text)
  # :) is encoded as <!-- s:) --><img src="{SMILIES_PATH}/icon_e_smile.gif" alt=":)" title="Smile" /><!-- s:) -->
  # The closing marker repeats the smiley code, which need not start with a
  # colon (e.g. "8-)"), so it is matched as "s" followed by any code.
  text.gsub!(/<!-- s(\S+) --><img src="\{SMILIES_PATH\}\/.+?" alt=".*?" title=".*?" \/><!-- s\S+ -->/) do
    emoji(Regexp.last_match(1))
  end
end

# Resolves a phpBB smiley code to its replacement text.
#
# A code already present in the smiley map is returned from there. Otherwise
# the smiley is looked up in the phpBB database and uploaded; when that
# yields nothing, the code is rendered via #smiley_as_text.
#
# @param smiley_code [String] smiley code as written in the post, e.g. ":)"
# @return [String]
def emoji(smiley_code)
  @smiley_map.fetch(smiley_code) do
    smiley = @database.get_smiley(smiley_code)
    uploaded = upload_smiley(smiley_code, smiley[:smiley_url], smiley_code, smiley[:emotion]) if smiley
    uploaded || smiley_as_text(smiley_code)
  end
end
@ -36,7 +42,7 @@ module ImportScripts::PhpBB3
[':o', ':-o', ':eek:'] => ':astonished:',
[':shock:'] => ':open_mouth:',
[':?', ':-?', ':???:'] => ':confused:',
['8-)', ':cool:'] => ':sunglasses:',
['8)', '8-)', ':cool:'] => ':sunglasses:',
[':lol:'] => ':laughing:',
[':x', ':-x', ':mad:'] => ':angry:',
[':P', ':-P', ':razz:'] => ':stuck_out_tongue:',

View File

@ -1,48 +1,75 @@
# frozen_string_literal: true
require_relative 'bbcode/xml_to_markdown'
module ImportScripts::PhpBB3
class TextProcessor
# @param lookup [ImportScripts::LookupContainer]
# @param database [ImportScripts::PhpBB3::Database_3_0 | ImportScripts::PhpBB3::Database_3_1]
# @param smiley_processor [ImportScripts::PhpBB3::SmileyProcessor]
# @param settings [ImportScripts::PhpBB3::Settings]
# @param phpbb_config [Hash] must contain :phpbb_version; a version starting
#   with "3.2" switches text conversion to the XML-to-Markdown converter
def initialize(lookup, database, smiley_processor, settings, phpbb_config)
  @lookup = lookup
  @database = database
  @smiley_processor = smiley_processor
  @he = HTMLEntities.new
  @use_xml_to_markdown = phpbb_config[:phpbb_version].start_with?('3.2')

  @settings = settings
  @new_site_prefix = settings.new_site_prefix
  create_internal_link_regexps(settings.original_site_prefix)
end
# Converts a raw phpBB text into Markdown.
#
# When the XML-to-Markdown path is enabled (see #initialize), +raw+ is
# handed to BBCode::XmlToMarkdown and short internal links are rewritten
# afterwards. Otherwise the text is HTML-unescaped and run through the
# BBCode, smiley, link, list and code processors of this class.
#
# @param raw [String] text as stored by phpBB
# @param attachments [Array<String>, nil] upload Markdown per attachment,
#   indexed by the position used in the post's attachment references
# @return [String] the converted text, with attachments that were not
#   referenced inline appended at the end
def process_raw_text(raw, attachments = nil)
  if @use_xml_to_markdown
    # Entries are set to nil as they are referenced inline; whatever is left
    # gets appended after the text.
    unreferenced_attachments = attachments&.dup

    converter = BBCode::XmlToMarkdown.new(
      raw,
      username_from_user_id: lambda { |user_id| @lookup.find_username_by_import_id(user_id) },
      smilie_to_emoji: lambda { |smilie| @smiley_processor.emoji(smilie).dup },
      quoted_post_from_post_id: lambda { |post_id| @lookup.topic_lookup_from_imported_post_id(post_id) },
      upload_md_from_file: (lambda do |filename, index|
        unreferenced_attachments[index] = nil
        attachments.fetch(index, filename).dup
      end if attachments),
      url_replacement: nil,
      allow_inline_code: false
    )

    text = converter.convert

    text.gsub!(@short_internal_link_regexp) do |link|
      replace_internal_link(link, Regexp.last_match(1), Regexp.last_match(2))
    end

    add_unreferenced_attachments(text, unreferenced_attachments)
  else
    text = raw.dup
    text = CGI.unescapeHTML(text)

    clean_bbcodes(text)
    if @settings.use_bbcode_to_md
      text = bbcode_to_md(text)
    end
    process_smilies(text)
    process_links(text)
    process_lists(text)
    process_code(text)
    fix_markdown(text)
    process_attachments(text, attachments) if attachments.present?
    text
  end
end
# Converts the raw text of a post, including its attachments.
#
# @param raw [String] post text as stored by phpBB
# @param attachments [Array<String>, nil] upload Markdown for the post's attachments
# @return [String] the converted text, or +raw+ unchanged when conversion fails
def process_post(raw, attachments)
  # Convert exactly once; process_raw_text already takes care of attachments.
  process_raw_text(raw, attachments)
rescue StandardError
  # Best effort: a post that cannot be converted is imported with its
  # original text instead of aborting the import.
  raw
end
# Converts the raw text of a private message, including its attachments.
#
# @param raw [String] message text as stored by phpBB
# @param attachments [Array<String>, nil] upload Markdown for the message's attachments
# @return [String] the converted text, or +raw+ unchanged when conversion fails
def process_private_msg(raw, attachments)
  # Convert exactly once; process_raw_text already takes care of attachments.
  process_raw_text(raw, attachments)
rescue StandardError
  # Best effort: a message that cannot be converted is imported with its
  # original text instead of aborting the import.
  raw
end
protected
@ -139,6 +166,12 @@ module ImportScripts::PhpBB3
attachments.fetch(index, real_filename)
end
add_unreferenced_attachments(text, unreferenced_attachments)
end
def add_unreferenced_attachments(text, unreferenced_attachments)
return text unless unreferenced_attachments
unreferenced_attachments = unreferenced_attachments.compact
text << "\n" << unreferenced_attachments.join("\n") unless unreferenced_attachments.empty?
text
@ -161,6 +194,7 @@ module ImportScripts::PhpBB3
def fix_markdown(text)
text.gsub!(/(\n*\[\/?quote.*?\]\n*)/mi) { |q| "\n#{q.strip}\n" }
text.gsub!(/^!\[[^\]]*\]\([^\]]*\)$/i) { |img| "\n#{img.strip}\n" } # space out images single on line
text
end
end