mirror of
https://github.com/discourse/discourse.git
synced 2025-05-31 07:19:40 +08:00
FEATURE: Split up text segmentation for Chinese and Japanese.
* Chinese segmentation will continue to rely on cppjieba * Japanese segmentation will use our port of TinySegmenter * Korean currently does not rely on segmentation which was dropped in c677877e4fe5381f613279901f36ae255c909573 * SiteSetting.search_tokenize_chinese_japanese_korean has been split into SiteSetting.search_tokenize_chinese and SiteSetting.search_tokenize_japanese respectively
This commit is contained in:
@ -64,9 +64,17 @@ class Search
|
||||
end
|
||||
end
|
||||
|
||||
def self.segment_cjk?
|
||||
['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) ||
|
||||
SiteSetting.search_tokenize_chinese_japanese_korean
|
||||
# Should search text be run through the Chinese (cppjieba) segmenter?
# True when the site's default locale is a Chinese locale, or when the
# dedicated site setting forces Chinese tokenization on.
def self.segment_chinese?
  chinese_locales = ['zh_TW', 'zh_CN']
  chinese_locales.include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese
end
|
||||
|
||||
# Should search text be run through the Japanese (TinySegmenter) segmenter?
# True when the site's default locale is Japanese, or when the dedicated
# site setting forces Japanese tokenization on.
def self.segment_japanese?
  return true if SiteSetting.default_locale == "ja"
  SiteSetting.search_tokenize_japanese
end
|
||||
|
||||
def self.japanese_punctuation_regexp
|
||||
# Regexp adapted from https://github.com/6/tiny_segmenter/blob/15a5b825993dfd2c662df3766f232051716bef5b/lib/tiny_segmenter.rb#L7
|
||||
@japanese_punctuation_regexp ||= Regexp.compile("[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]")
|
||||
end
|
||||
|
||||
def self.prepare_data(search_data, purpose = nil)
|
||||
@ -74,22 +82,35 @@ class Search
|
||||
data.force_encoding("UTF-8")
|
||||
|
||||
if purpose != :topic
|
||||
# TODO cppjieba_rb is designed for chinese, we need something else for Japanese
|
||||
# Korean appears to be safe cause words are already space separated
|
||||
# For Japanese we should investigate using kakasi
|
||||
if segment_cjk?
|
||||
if segment_chinese?
|
||||
require 'cppjieba_rb' unless defined? CppjiebaRb
|
||||
data = CppjiebaRb.segment(search_data, mode: :mix)
|
||||
|
||||
# TODO: we still want to tokenize here but the current stopword list is too wide
|
||||
# in cppjieba leading to words such as volume to be skipped. PG already has an English
|
||||
# stopword list so use that vs relying on cppjieba
|
||||
if ts_config != 'english'
|
||||
data = CppjiebaRb.filter_stop_word(data)
|
||||
else
|
||||
data = data.filter { |s| s.present? }
|
||||
segmented_data = []
|
||||
|
||||
# We need to split up the string here because Cppjieba has a bug where text starting with numeric chars will
|
||||
# be split into two segments. For example, '123abc' becomes '123' and 'abc' after segmentation.
|
||||
data.scan(/(?<chinese>[\p{Han}。,、“”《》…\.:?!;()]+)|([^\p{Han}]+)/) do
|
||||
match_data = $LAST_MATCH_INFO
|
||||
|
||||
if match_data[:chinese]
|
||||
segments = CppjiebaRb.segment(match_data.to_s, mode: :mix)
|
||||
|
||||
if ts_config != 'english'
|
||||
segments = CppjiebaRb.filter_stop_word(segments)
|
||||
end
|
||||
|
||||
segments = segments.filter { |s| s.present? }
|
||||
segmented_data << segments.join(' ')
|
||||
else
|
||||
segmented_data << match_data.to_s.squish
|
||||
end
|
||||
end
|
||||
|
||||
data = segmented_data.join(' ')
|
||||
elsif segment_japanese?
|
||||
data.gsub!(japanese_punctuation_regexp, " ")
|
||||
data = TinyJapaneseSegmenter.segment(data)
|
||||
data = data.filter { |s| s.present? }
|
||||
data = data.join(' ')
|
||||
else
|
||||
data.squish!
|
||||
@ -263,7 +284,7 @@ class Search
|
||||
end
|
||||
|
||||
unless @filters.present? || @opts[:search_for_id]
|
||||
min_length = @opts[:min_search_term_length] || SiteSetting.min_search_term_length
|
||||
min_length = min_search_term_length
|
||||
terms = (@term || '').split(/\s(?=(?:[^"]|"[^"]*")*$)/).reject { |t| t.length < min_length }
|
||||
|
||||
if terms.blank?
|
||||
@ -571,7 +592,7 @@ class Search
|
||||
SQL
|
||||
|
||||
# a bit yucky but we got to add the term back in
|
||||
elsif match.to_s.length >= SiteSetting.min_search_term_length
|
||||
elsif match.to_s.length >= min_search_term_length
|
||||
posts.where <<~SQL
|
||||
posts.id IN (
|
||||
SELECT post_id FROM post_search_data pd1
|
||||
@ -1304,4 +1325,18 @@ class Search
|
||||
!readonly_mode &&
|
||||
@opts[:type_filter] != "exclude_topics"
|
||||
end
|
||||
|
||||
# Minimum length a search term must have to be considered.
#
# Resolution order:
#   1. an explicit :min_search_term_length in the search options,
#   2. the zh_CN locale default when Chinese tokenization is enabled,
#   3. the ja locale default when Japanese tokenization is enabled,
#   4. the site-wide min_search_term_length setting.
def min_search_term_length
  override = @opts[:min_search_term_length]
  return override if override

  return SiteSetting.defaults.get('min_search_term_length', 'zh_CN') if SiteSetting.search_tokenize_chinese
  return SiteSetting.defaults.get('min_search_term_length', 'ja') if SiteSetting.search_tokenize_japanese

  SiteSetting.min_search_term_length
end
|
||||
end
|
||||
|
Reference in New Issue
Block a user