mirror of
https://github.com/discourse/discourse.git
synced 2025-05-23 07:11:17 +08:00
Improved mailing list import.
Now uses a SQLite database to store messages rather than JSON files for performance and memory considerations.
This commit is contained in:
@ -198,10 +198,14 @@ class ImportScripts::Base
|
|||||||
def all_records_exist?(type, import_ids)
|
def all_records_exist?(type, import_ids)
|
||||||
return false if import_ids.empty?
|
return false if import_ids.empty?
|
||||||
|
|
||||||
Post.exec_sql('CREATE TEMP TABLE import_ids(val varchar(200) PRIMARY KEY)')
|
orig_conn = ActiveRecord::Base.connection
|
||||||
|
conn = orig_conn.raw_connection
|
||||||
|
|
||||||
|
conn.exec('CREATE TEMP TABLE import_ids(val varchar(200) PRIMARY KEY)')
|
||||||
|
|
||||||
import_id_clause = import_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",")
|
import_id_clause = import_ids.map { |id| "('#{PG::Connection.escape_string(id.to_s)}')" }.join(",")
|
||||||
Post.exec_sql("INSERT INTO import_ids VALUES #{import_id_clause}")
|
|
||||||
|
conn.exec("INSERT INTO import_ids VALUES #{import_id_clause}")
|
||||||
|
|
||||||
existing = "#{type.to_s.classify}CustomField".constantize.where(name: 'import_id')
|
existing = "#{type.to_s.classify}CustomField".constantize.where(name: 'import_id')
|
||||||
existing = existing.joins('JOIN import_ids ON val = value')
|
existing = existing.joins('JOIN import_ids ON val = value')
|
||||||
@ -211,7 +215,7 @@ class ImportScripts::Base
|
|||||||
return true
|
return true
|
||||||
end
|
end
|
||||||
ensure
|
ensure
|
||||||
Post.exec_sql('DROP TABLE import_ids')
|
conn.exec('DROP TABLE import_ids')
|
||||||
end
|
end
|
||||||
|
|
||||||
# Iterate through a list of user records to be imported.
|
# Iterate through a list of user records to be imported.
|
||||||
@ -366,7 +370,7 @@ class ImportScripts::Base
|
|||||||
params[:parent_category_id] = top.id if top
|
params[:parent_category_id] = top.id if top
|
||||||
end
|
end
|
||||||
|
|
||||||
new_category = create_category(params, params[:id])
|
create_category(params, params[:id])
|
||||||
|
|
||||||
created += 1
|
created += 1
|
||||||
end
|
end
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
require 'sqlite3'
|
||||||
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
require File.expand_path(File.dirname(__FILE__) + "/base.rb")
|
||||||
|
|
||||||
class ImportScripts::Mbox < ImportScripts::Base
|
class ImportScripts::Mbox < ImportScripts::Base
|
||||||
@ -5,96 +6,169 @@ class ImportScripts::Mbox < ImportScripts::Base
|
|||||||
|
|
||||||
BATCH_SIZE = 1000
|
BATCH_SIZE = 1000
|
||||||
CATEGORY_ID = 6
|
CATEGORY_ID = 6
|
||||||
MBOX_DIR = "/tmp/mbox-input"
|
MBOX_DIR = File.expand_path("~/import/site")
|
||||||
USER_INDEX_PATH = "#{MBOX_DIR}/user-index.json"
|
|
||||||
TOPIC_INDEX_PATH = "#{MBOX_DIR}/topic-index.json"
|
# Remove to not split individual files
|
||||||
REPLY_INDEX_PATH = "#{MBOX_DIR}/replies-index.json"
|
SPLIT_AT = /^From owner-/
|
||||||
|
|
||||||
def execute
|
def execute
|
||||||
create_indices
|
create_email_indices
|
||||||
|
create_user_indices
|
||||||
|
massage_indices
|
||||||
import_users
|
import_users
|
||||||
create_forum_topics
|
create_forum_topics
|
||||||
import_replies
|
import_replies
|
||||||
end
|
end
|
||||||
|
|
||||||
def all_messages
|
def open_db
|
||||||
|
SQLite3::Database.new("#{MBOX_DIR}/index.db")
|
||||||
|
end
|
||||||
|
|
||||||
files = Dir["#{MBOX_DIR}/*/*"]
|
def all_messages
|
||||||
|
files = Dir["#{MBOX_DIR}/messages/*"]
|
||||||
|
|
||||||
files.each_with_index do |f, idx|
|
files.each_with_index do |f, idx|
|
||||||
|
if SPLIT_AT.present?
|
||||||
|
msg = ""
|
||||||
|
File.foreach(f).with_index do |line, line_num|
|
||||||
|
line = line.scrub
|
||||||
|
if line =~ SPLIT_AT
|
||||||
|
if !msg.empty?
|
||||||
|
mail = Mail.read_from_string(msg)
|
||||||
|
yield mail
|
||||||
|
print_status(idx, files.size)
|
||||||
|
msg = ""
|
||||||
|
end
|
||||||
|
end
|
||||||
|
msg << line
|
||||||
|
end
|
||||||
|
if !msg.empty?
|
||||||
|
mail = Mail.read_from_string(msg)
|
||||||
|
yield mail
|
||||||
|
print_status(idx, files.size)
|
||||||
|
msg = ""
|
||||||
|
end
|
||||||
|
else
|
||||||
raw = File.read(f)
|
raw = File.read(f)
|
||||||
mail = Mail.read_from_string(raw)
|
mail = Mail.read_from_string(raw)
|
||||||
yield mail, f
|
yield mail
|
||||||
print_status(idx, files.size)
|
print_status(idx, files.size)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
def create_indices
|
def massage_indices
|
||||||
return if File.exist?(USER_INDEX_PATH) && File.exist?(TOPIC_INDEX_PATH) && File.exist?(REPLY_INDEX_PATH)
|
db = open_db
|
||||||
puts "", "creating indices"
|
db.execute "UPDATE emails SET reply_to = null WHERE reply_to = ''"
|
||||||
users = {}
|
|
||||||
|
|
||||||
topics = []
|
rows = db.execute "SELECT msg_id, title, reply_to FROM emails ORDER BY email_date ASC"
|
||||||
|
|
||||||
topic_lookup = {}
|
msg_ids = {}
|
||||||
topic_titles = {}
|
titles = {}
|
||||||
replies = []
|
rows.each do |row|
|
||||||
|
msg_ids[row[0]] = true
|
||||||
|
titles[row[1]] = row[0]
|
||||||
|
end
|
||||||
|
|
||||||
all_messages do |mail, filename|
|
# First, any replies where the parent doesn't exist should have that field cleared
|
||||||
users[mail.from.first] = mail[:from].display_names.first
|
not_found = []
|
||||||
|
rows.each do |row|
|
||||||
msg_id = mail['Message-ID'].to_s
|
msg_id, _, reply_to = row
|
||||||
reply_to = mail['In-Reply-To'].to_s
|
|
||||||
title = clean_title(mail['Subject'].to_s)
|
|
||||||
date = Time.parse(mail['date'].to_s).to_i
|
|
||||||
|
|
||||||
if reply_to.present?
|
if reply_to.present?
|
||||||
topic = topic_lookup[reply_to] || reply_to
|
not_found << msg_id if msg_ids[reply_to].blank?
|
||||||
topic_lookup[msg_id] = topic
|
|
||||||
replies << {id: msg_id, topic: topic, file: filename, title: title, date: date}
|
|
||||||
else
|
|
||||||
topics << {id: msg_id, file: filename, title: title, date: date}
|
|
||||||
topic_titles[title] ||= msg_id
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
replies.sort! {|a, b| a[:date] <=> b[:date]}
|
puts "#{not_found.size} records couldn't be associated with parents"
|
||||||
topics.sort! {|a, b| a[:date] <=> b[:date]}
|
if not_found.present?
|
||||||
|
db.execute "UPDATE emails SET reply_to = NULL WHERE msg_id IN (#{not_found.map {|nf| "'#{nf}'"}.join(',')})"
|
||||||
# Replies without parents should be hoisted to topics
|
|
||||||
to_hoist = []
|
|
||||||
replies.each do |r|
|
|
||||||
to_hoist << r if !topic_lookup[r[:topic]]
|
|
||||||
end
|
end
|
||||||
|
|
||||||
to_hoist.each do |h|
|
dupe_titles = db.execute "SELECT title, COUNT(*) FROM emails GROUP BY title HAVING count(*) > 1"
|
||||||
replies.delete(h)
|
puts "#{dupe_titles.size} replies to wire up"
|
||||||
topics << {id: h[:id], file: h[:file], title: h[:title], date: h[:date]}
|
dupe_titles.each do |t|
|
||||||
topic_titles[h[:title]] ||= h[:id]
|
title = t[0]
|
||||||
|
first = titles[title]
|
||||||
|
db.execute "UPDATE emails SET reply_to = ? WHERE title = ? and msg_id <> ?", [first, title, first]
|
||||||
end
|
end
|
||||||
|
|
||||||
# Topics with duplicate replies should be replies
|
ensure
|
||||||
to_group = []
|
db.close
|
||||||
topics.each do |t|
|
|
||||||
first = topic_titles[t[:title]]
|
|
||||||
to_group << t if first && first != t[:id]
|
|
||||||
end
|
end
|
||||||
|
|
||||||
to_group.each do |t|
|
def create_email_indices
|
||||||
topics.delete(t)
|
db = open_db
|
||||||
replies << {id: t[:id], topic: topic_titles[t[:title]], file: t[:file], title: t[:title], date: t[:date]}
|
db.execute "DROP TABLE IF EXISTS emails"
|
||||||
|
db.execute <<-SQL
|
||||||
|
CREATE TABLE emails (
|
||||||
|
msg_id VARCHAR(995) PRIMARY KEY,
|
||||||
|
from_email VARCHAR(255) NOT NULL,
|
||||||
|
from_name VARCHAR(255) NOT NULL,
|
||||||
|
title VARCHAR(255) NOT NULL,
|
||||||
|
reply_to VARCHAR(955) NULL,
|
||||||
|
email_date DATETIME NOT NULL,
|
||||||
|
message TEXT NOT NULL
|
||||||
|
);
|
||||||
|
SQL
|
||||||
|
|
||||||
|
db.execute "CREATE INDEX by_title ON emails (title)"
|
||||||
|
db.execute "CREATE INDEX by_email ON emails (from_email)"
|
||||||
|
|
||||||
|
puts "", "creating indices"
|
||||||
|
|
||||||
|
all_messages do |mail|
|
||||||
|
msg_id = mail['Message-ID'].to_s
|
||||||
|
|
||||||
|
# Many ways to get a name
|
||||||
|
from = mail[:from]
|
||||||
|
from_name = nil
|
||||||
|
|
||||||
|
from_email = nil
|
||||||
|
if mail.from.present?
|
||||||
|
from_email = mail.from.first
|
||||||
end
|
end
|
||||||
|
|
||||||
replies.sort! {|a, b| a[:date] <=> b[:date]}
|
display_names = from.try(:display_names)
|
||||||
topics.sort! {|a, b| a[:date] <=> b[:date]}
|
if display_names.present?
|
||||||
|
from_name = display_names.first
|
||||||
|
end
|
||||||
|
|
||||||
|
if from_name.blank? && from.to_s =~ /\(([^\)]+)\)/
|
||||||
|
from_name = Regexp.last_match[1]
|
||||||
|
end
|
||||||
|
from_name = from.to_s if from_name.blank?
|
||||||
|
|
||||||
File.write(USER_INDEX_PATH, {users: users}.to_json)
|
title = clean_title(mail['Subject'].to_s)
|
||||||
File.write(TOPIC_INDEX_PATH, {topics: topics}.to_json)
|
reply_to = mail['In-Reply-To'].to_s
|
||||||
File.write(REPLY_INDEX_PATH, {replies: replies}.to_json)
|
email_date = mail['date'].to_s
|
||||||
|
|
||||||
|
db.execute "INSERT OR IGNORE INTO emails (msg_id, from_email, from_name, title, reply_to, email_date, message)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?)",
|
||||||
|
[msg_id, from_email, from_name, title, reply_to, email_date, mail.to_s]
|
||||||
|
end
|
||||||
|
ensure
|
||||||
|
db.close
|
||||||
|
end
|
||||||
|
|
||||||
|
def create_user_indices
|
||||||
|
db = open_db
|
||||||
|
db.execute "DROP TABLE IF EXISTS users"
|
||||||
|
db.execute <<-SQL
|
||||||
|
CREATE TABLE users (
|
||||||
|
email VARCHAR(995) PRIMARY KEY,
|
||||||
|
name VARCHAR(255) NOT NULL
|
||||||
|
);
|
||||||
|
SQL
|
||||||
|
|
||||||
|
db.execute "INSERT OR IGNORE INTO users (email, name) SELECT from_email, from_name FROM emails"
|
||||||
|
ensure
|
||||||
|
db.close
|
||||||
end
|
end
|
||||||
|
|
||||||
def clean_title(title)
|
def clean_title(title)
|
||||||
|
title ||= ""
|
||||||
#Strip mailing list name from subject
|
#Strip mailing list name from subject
|
||||||
title = title.gsub(/\[[^\]]+\]+/, '').strip
|
title = title.gsub(/\[[^\]]+\]+/, '').strip
|
||||||
|
|
||||||
@ -125,24 +199,26 @@ class ImportScripts::Mbox < ImportScripts::Base
|
|||||||
|
|
||||||
def import_users
|
def import_users
|
||||||
puts "", "importing users"
|
puts "", "importing users"
|
||||||
|
db = open_db
|
||||||
|
|
||||||
all_users = ::JSON.parse(File.read(USER_INDEX_PATH))['users']
|
all_users = db.execute("SELECT name, email FROM users")
|
||||||
user_keys = all_users.keys
|
total_count = all_users.size
|
||||||
total_count = user_keys.size
|
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
batches(BATCH_SIZE) do |offset|
|
||||||
users = user_keys[offset..offset+BATCH_SIZE-1]
|
users = all_users[offset..offset+BATCH_SIZE-1]
|
||||||
break if users.nil?
|
break if users.nil?
|
||||||
next if all_records_exist? :users, users
|
next if all_records_exist? :users, users.map {|u| u[1]}
|
||||||
|
|
||||||
create_users(users, total: total_count, offset: offset) do |email|
|
create_users(users, total: total_count, offset: offset) do |u|
|
||||||
{
|
{
|
||||||
id: email,
|
id: u[1],
|
||||||
email: email,
|
email: u[1],
|
||||||
name: all_users[email]
|
name: u[0]
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
ensure
|
||||||
|
db.close
|
||||||
end
|
end
|
||||||
|
|
||||||
def parse_email(msg)
|
def parse_email(msg)
|
||||||
@ -157,23 +233,33 @@ class ImportScripts::Mbox < ImportScripts::Base
|
|||||||
def create_forum_topics
|
def create_forum_topics
|
||||||
puts "", "creating forum topics"
|
puts "", "creating forum topics"
|
||||||
|
|
||||||
all_topics = ::JSON.parse(File.read(TOPIC_INDEX_PATH))['topics']
|
db = open_db
|
||||||
|
all_topics = db.execute("SELECT msg_id,
|
||||||
|
from_email,
|
||||||
|
from_name,
|
||||||
|
title,
|
||||||
|
email_date,
|
||||||
|
message
|
||||||
|
FROM emails
|
||||||
|
WHERE reply_to IS NULL")
|
||||||
|
|
||||||
topic_count = all_topics.size
|
topic_count = all_topics.size
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
batches(BATCH_SIZE) do |offset|
|
||||||
topics = all_topics[offset..offset+BATCH_SIZE-1]
|
topics = all_topics[offset..offset+BATCH_SIZE-1]
|
||||||
break if topics.nil?
|
break if topics.nil?
|
||||||
|
|
||||||
next if all_records_exist? :posts, topics.map {|t| t['id']}
|
next if all_records_exist? :posts, topics.map {|t| t[0]}
|
||||||
|
|
||||||
create_posts(topics, total: topic_count, offset: offset) do |t|
|
create_posts(topics, total: topic_count, offset: offset) do |t|
|
||||||
raw_email = File.read(t['file'])
|
raw_email = t[5]
|
||||||
receiver = Email::Receiver.new(raw_email)
|
receiver = Email::Receiver.new(raw_email)
|
||||||
mail = Mail.read_from_string(raw_email)
|
mail = Mail.read_from_string(raw_email)
|
||||||
mail.body
|
mail.body
|
||||||
|
|
||||||
selected = receiver.select_body
|
selected = receiver.select_body
|
||||||
next unless selected
|
next unless selected
|
||||||
|
selected = selected.join('') if selected.kind_of?(Array)
|
||||||
|
|
||||||
raw = selected.force_encoding(selected.encoding).encode("UTF-8")
|
raw = selected.force_encoding(selected.encoding).encode("UTF-8")
|
||||||
|
|
||||||
@ -195,7 +281,7 @@ class ImportScripts::Mbox < ImportScripts::Base
|
|||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
{ id: t['id'],
|
{ id: t[0],
|
||||||
title: clean_title(title),
|
title: clean_title(title),
|
||||||
user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
|
user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
|
||||||
created_at: mail.date,
|
created_at: mail.date,
|
||||||
@ -204,34 +290,49 @@ class ImportScripts::Mbox < ImportScripts::Base
|
|||||||
cook_method: Post.cook_methods[:email] }
|
cook_method: Post.cook_methods[:email] }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
ensure
|
||||||
|
db.close
|
||||||
end
|
end
|
||||||
|
|
||||||
def import_replies
|
def import_replies
|
||||||
puts "", "creating topic replies"
|
puts "", "creating topic replies"
|
||||||
|
|
||||||
replies = ::JSON.parse(File.read(REPLY_INDEX_PATH))['replies']
|
db = open_db
|
||||||
|
replies = db.execute("SELECT msg_id,
|
||||||
|
from_email,
|
||||||
|
from_name,
|
||||||
|
title,
|
||||||
|
email_date,
|
||||||
|
message,
|
||||||
|
reply_to
|
||||||
|
FROM emails
|
||||||
|
WHERE reply_to IS NOT NULL")
|
||||||
|
|
||||||
post_count = replies.size
|
post_count = replies.size
|
||||||
|
|
||||||
batches(BATCH_SIZE) do |offset|
|
batches(BATCH_SIZE) do |offset|
|
||||||
posts = replies[offset..offset+BATCH_SIZE-1]
|
posts = replies[offset..offset+BATCH_SIZE-1]
|
||||||
break if posts.nil?
|
break if posts.nil?
|
||||||
|
|
||||||
next if all_records_exist? :posts, posts.map {|p| p['id']}
|
next if all_records_exist? :posts, posts.map {|p| p[0]}
|
||||||
|
|
||||||
create_posts(posts, total: post_count, offset: offset) do |p|
|
create_posts(posts, total: post_count, offset: offset) do |p|
|
||||||
parent_id = p['topic']
|
parent_id = p[6]
|
||||||
id = p['id']
|
id = p[0]
|
||||||
|
|
||||||
topic = topic_lookup_from_imported_post_id(parent_id)
|
topic = topic_lookup_from_imported_post_id(parent_id)
|
||||||
topic_id = topic[:topic_id] if topic
|
topic_id = topic[:topic_id] if topic
|
||||||
next unless topic_id
|
next unless topic_id
|
||||||
|
|
||||||
raw_email = File.read(p['file'])
|
raw_email = p[5]
|
||||||
receiver = Email::Receiver.new(raw_email)
|
receiver = Email::Receiver.new(raw_email)
|
||||||
mail = Mail.read_from_string(raw_email)
|
mail = Mail.read_from_string(raw_email)
|
||||||
mail.body
|
mail.body
|
||||||
|
|
||||||
selected = receiver.select_body
|
selected = receiver.select_body
|
||||||
|
selected = selected.join('') if selected.kind_of?(Array)
|
||||||
|
next unless selected
|
||||||
|
|
||||||
raw = selected.force_encoding(selected.encoding).encode("UTF-8")
|
raw = selected.force_encoding(selected.encoding).encode("UTF-8")
|
||||||
|
|
||||||
# import the attachments
|
# import the attachments
|
||||||
@ -258,6 +359,8 @@ class ImportScripts::Mbox < ImportScripts::Base
|
|||||||
cook_method: Post.cook_methods[:email] }
|
cook_method: Post.cook_methods[:email] }
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
ensure
|
||||||
|
db.close
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user