Replace rmmseg gem for cppjieba_rb since better dictionary (#5006)

* Rename locale to ts config in search module to make it clear

* Replace rmmese-cpp for cppjieba_rb
This commit is contained in:
Erick Guan
2017-07-31 21:28:48 +02:00
committed by Sam
parent 6c997b65d9
commit f1eccd113c
5 changed files with 58 additions and 42 deletions

View File

@ -20,13 +20,15 @@ class Search
%w(topic category user private_messages)
end
def self.long_locale
# if adding a language see:
# /usr/share/postgresql/9.3/tsearch_data for possible options
# Do not add languages that are missing without amending the
def self.ts_config(locale = SiteSetting.default_locale)
# if adding a text search configuration, you should check PG beforehand:
# SELECT cfgname FROM pg_ts_config;
# As an aside, dictionaries can be listed by `\dFd`, the
# physical locations are in /usr/share/postgresql/<version>/tsearch_data.
# But it may not appear there based on pg extension configuration.
# base docker config
#
case SiteSetting.default_locale.to_sym
case locale.to_sym
when :da then 'danish'
when :de then 'german'
when :en then 'english'
@ -39,7 +41,7 @@ class Search
when :pt_BR then 'portuguese'
when :sv then 'swedish'
when :ru then 'russian'
else 'simple' # use the 'simple' stemmer for other languages
else 'simple' # use the 'simple' stemmer for other languages
end
end
@ -73,21 +75,16 @@ class Search
nil
end
def self.prepare_data(search_data)
def self.prepare_data(search_data, purpose = :query)
data = search_data.squish
# TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
# TODO cppjieba_rb is designed for chinese, we need something else for Korean / Japanese
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese_japanese_korean
unless defined? RMMSeg
require 'rmmseg'
RMMSeg::Dictionary.load_dictionaries
end
algo = RMMSeg::Algorithm.new(search_data)
data = ""
while token = algo.next_token
data << token.text << " "
unless defined? CppjiebaRb
require 'cppjieba_rb'
end
mode = (purpose == :query ? :query : :mix)
data = CppjiebaRb.segment(search_data, mode: mode)
data = CppjiebaRb.filter_stop_word(data).join(' ')
end
data.force_encoding("UTF-8")
@ -477,7 +474,7 @@ class Search
FROM topic_tags tt, tags
WHERE tt.tag_id = tags.id
GROUP BY tt.topic_id
HAVING to_tsvector(#{query_locale}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{query_locale}, ?)
HAVING to_tsvector(#{default_ts_config}, array_to_string(array_agg(tags.name), ' ')) @@ to_tsquery(#{default_ts_config}, ?)
)", tags.join('&'))
else
tags = match.split(",")
@ -678,7 +675,6 @@ class Search
posts = posts.where("posts.raw || ' ' || u.username || ' ' || COALESCE(u.name, '') ilike ?", "%#{term_without_quote}%")
else
posts = posts.where("post_search_data.search_data @@ #{ts_query}")
exact_terms = @term.scan(/"([^"]+)"/).flatten
exact_terms.each do |exact|
posts = posts.where("posts.raw ilike ?", "%#{exact}%")
@ -740,7 +736,7 @@ class Search
posts = posts.order("posts.like_count DESC")
end
else
posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{query_locale}, topics.title), #{ts_query}) DESC")
posts = posts.order("TS_RANK_CD(TO_TSVECTOR(#{default_ts_config}, topics.title), #{ts_query}) DESC")
data_ranking = "TS_RANK_CD(post_search_data.search_data, #{ts_query})"
if opts[:aggregate_search]
@ -760,34 +756,33 @@ class Search
posts.limit(limit)
end
def self.query_locale
"'#{Search.long_locale}'"
def self.default_ts_config
"'#{Search.ts_config}'"
end
def query_locale
self.class.query_locale
def default_ts_config
self.class.default_ts_config
end
def self.ts_query(term, locale = nil, joiner = "&")
def self.ts_query(term, ts_config = nil, joiner = "&")
data = Post.exec_sql("SELECT to_tsvector(:locale, :term)",
locale: 'simple',
term: term
).values[0][0]
data = Post.exec_sql("SELECT TO_TSVECTOR(:config, :term)",
config: 'simple',
term: term).values[0][0]
locale = Post.sanitize(locale) if locale
ts_config = Post.sanitize(ts_config) if ts_config
all_terms = data.scan(/'([^']+)'\:\d+/).flatten
all_terms.map! do |t|
t.split(/[\)\(&']/)[0]
end.compact!
query = Post.sanitize(all_terms.map { |t| "'#{PG::Connection.escape_string(t)}':*" }.join(" #{joiner} "))
"TO_TSQUERY(#{locale || query_locale}, #{query})"
"TO_TSQUERY(#{ts_config || default_ts_config}, #{query})"
end
def ts_query(locale = nil)
def ts_query(ts_config = nil)
@ts_query_cache ||= {}
@ts_query_cache[(locale || query_locale) + " " + @term] ||= Search.ts_query(@term, locale)
@ts_query_cache["#{ts_config || default_ts_config} #{@term}"] ||= Search.ts_query(@term, ts_config)
end
def wrap_rows(query)