FEATURE: Split up text segmentation for Chinese and Japanese.

* Chinese segmentation will continue to rely on cppjieba
* Japanese segmentation will use our port of TinySegmenter (see the sketch below)
* Korean currently does not rely on segmentation; it was dropped in c677877e4fe5381f613279901f36ae255c909573
* SiteSetting.search_tokenize_chinese_japanese_korean has been split
into SiteSetting.search_tokenize_chinese and
SiteSetting.search_tokenize_japanese
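
For illustration, a rough sketch of how the two paths now differ. The calls mirror the diff below; the sample strings and the segment boundaries shown are illustrative assumptions, not taken from tests:

    # Chinese continues to go through cppjieba in :mix mode
    CppjiebaRb.segment("今天天气真好", mode: :mix)
    # => e.g. ["今天", "天气", "真好"]

    # Japanese goes through the TinySegmenter port after punctuation is stripped
    TinyJapaneseSegmenter.segment("今日はいい天気です")
    # => e.g. ["今日", "は", "いい", "天気", "です"]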
Alan Guo Xiang Tan
2022-01-26 15:24:11 +08:00
parent 9ddd1f739e
commit 930f51e175
14 changed files with 406 additions and 72 deletions


@@ -64,9 +64,17 @@ class Search
end
end
def self.segment_cjk?
['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) ||
SiteSetting.search_tokenize_chinese_japanese_korean
def self.segment_chinese?
['zh_TW', 'zh_CN'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese
end
def self.segment_japanese?
SiteSetting.default_locale == "ja" || SiteSetting.search_tokenize_japanese
end
def self.japanese_punctuation_regexp
# Regexp adapted from https://github.com/6/tiny_segmenter/blob/15a5b825993dfd2c662df3766f232051716bef5b/lib/tiny_segmenter.rb#L7
@japanese_punctuation_regexp ||= Regexp.compile("[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]")
end
def self.prepare_data(search_data, purpose = nil)
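
A minimal sketch of how the new Japanese helpers are meant to compose; the sample string and the resulting segments are illustrative assumptions:

    text = "「検索」は、速い!"
    text = text.gsub(Search.japanese_punctuation_regexp, " ")
    # the brackets, the comma and the fullwidth "!" are replaced with spaces
    TinyJapaneseSegmenter.segment(text).filter(&:present?)
    # => e.g. ["検索", "は", "速い"]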
@@ -74,22 +82,35 @@ class Search
data.force_encoding("UTF-8")
if purpose != :topic
# TODO cppjieba_rb is designed for chinese, we need something else for Japanese
# Korean appears to be safe cause words are already space separated
# For Japanese we should investigate using kakasi
if segment_cjk?
if segment_chinese?
require 'cppjieba_rb' unless defined? CppjiebaRb
data = CppjiebaRb.segment(search_data, mode: :mix)
# TODO: we still want to tokenize here but the current stopword list is too wide
# in cppjieba leading to words such as volume to be skipped. PG already has an English
# stopword list so use that vs relying on cppjieba
if ts_config != 'english'
data = CppjiebaRb.filter_stop_word(data)
else
data = data.filter { |s| s.present? }
segmented_data = []
# We need to split up the string here because Cppjieba has a bug where text starting with numeric chars will
# be split into two segments. For example, '123abc' becomes '123' and 'abc' after segmentation.
data.scan(/(?<chinese>[\p{Han}。,、“”《》…\.:?!;()]+)|([^\p{Han}]+)/) do
match_data = $LAST_MATCH_INFO
if match_data[:chinese]
segments = CppjiebaRb.segment(match_data.to_s, mode: :mix)
if ts_config != 'english'
segments = CppjiebaRb.filter_stop_word(segments)
end
segments = segments.filter { |s| s.present? }
segmented_data << segments.join(' ')
else
segmented_data << match_data.to_s.squish
end
end
data = segmented_data.join(' ')
elsif segment_japanese?
data.gsub!(japanese_punctuation_regexp, " ")
data = TinyJapaneseSegmenter.segment(data)
data = data.filter { |s| s.present? }
data = data.join(' ')
else
data.squish!
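
An illustrative trace of the scan above; the mixed sample input and the jieba segment boundaries are assumptions:

    require 'English' # provides $LAST_MATCH_INFO, mirroring the production code
    "hello 你好世界 123abc".scan(/(?<chinese>[\p{Han}。,、“”《》…\.:?!;()]+)|([^\p{Han}]+)/) do
      match_data = $LAST_MATCH_INFO
      # match_data[:chinese] is non-nil only for the Han run "你好世界", which is
      # handed to CppjiebaRb.segment (e.g. "你好 世界"); "hello " and " 123abc" come
      # through as non-Han runs and are appended squished, so cppjieba's
      # '123abc' -> '123' / 'abc' numeric-prefix bug never sees mixed input
    end
    # the joined result would then be e.g. "hello 你好 世界 123abc"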
@@ -263,7 +284,7 @@ class Search
end
unless @filters.present? || @opts[:search_for_id]
min_length = @opts[:min_search_term_length] || SiteSetting.min_search_term_length
min_length = min_search_term_length
terms = (@term || '').split(/\s(?=(?:[^"]|"[^"]*")*$)/).reject { |t| t.length < min_length }
if terms.blank?
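
The quote-aware split above keeps quoted phrases intact before the length check; an illustrative example of the regexp's behaviour:

    'foo "bar baz" qux'.split(/\s(?=(?:[^"]|"[^"]*")*$)/)
    # => ["foo", "\"bar baz\"", "qux"]
    # only whitespace followed by a balanced number of quotes splits the term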
@@ -571,7 +592,7 @@ class Search
SQL
# a bit yucky but we got to add the term back in
elsif match.to_s.length >= SiteSetting.min_search_term_length
elsif match.to_s.length >= min_search_term_length
posts.where <<~SQL
posts.id IN (
SELECT post_id FROM post_search_data pd1
@@ -1304,4 +1325,18 @@ class Search
!readonly_mode &&
@opts[:type_filter] != "exclude_topics"
end
def min_search_term_length
return @opts[:min_search_term_length] if @opts[:min_search_term_length]
if SiteSetting.search_tokenize_chinese
return SiteSetting.defaults.get('min_search_term_length', 'zh_CN')
end
if SiteSetting.search_tokenize_japanese
return SiteSetting.defaults.get('min_search_term_length', 'ja')
end
SiteSetting.min_search_term_length
end
end
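
Taken together, the helper above means an explicit @opts[:min_search_term_length] always wins; otherwise, enabling Chinese or Japanese tokenization switches the length check to the zh_CN or ja locale default rather than the site-wide setting. A rough usage sketch, assuming those locale defaults are lower than the site-wide default:

    # CJK terms are short, so with Japanese tokenization enabled a two-character
    # query like "東京" should survive the min-length filter
    SiteSetting.search_tokenize_japanese = true
    Search.new("東京").execute

    # callers can still force a threshold per search
    Search.new("東京", min_search_term_length: 1).execute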