FEATURE: Split up text segmentation for Chinese and Japanese.

* Chinese segmentation will continue to rely on cppjieba (see the routing sketch below)
* Japanese segmentation will use our port of TinySegmenter
* Korean currently does not rely on segmentation; support for it was dropped in c677877e4fe5381f613279901f36ae255c909573
* SiteSetting.search_tokenize_chinese_japanese_korean has been split
into SiteSetting.search_tokenize_chinese and
SiteSetting.search_tokenize_japanese; the two are mutually exclusive (see the validator sketch below)
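
For context, a minimal sketch of how the split settings might route text to the appropriate segmenter. This is not the actual Discourse implementation; the wrapper names `CppjiebaSegmenter` and `TinyJapaneseSegmenter` are hypothetical, chosen only to illustrate the dispatch the two settings imply.

```ruby
# Hedged sketch: dispatch on the new per-language settings.
# CppjiebaSegmenter and TinyJapaneseSegmenter are hypothetical names.
def segment_for_search(text)
  if SiteSetting.search_tokenize_chinese
    CppjiebaSegmenter.segment(text)      # Chinese: backed by cppjieba
  elsif SiteSetting.search_tokenize_japanese
    TinyJapaneseSegmenter.segment(text)  # Japanese: the TinySegmenter port
  else
    text                                 # no CJK segmentation configured
  end
end
```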
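The mutual exclusivity exercised by the specs below suggests a pair of site-setting validators following Discourse's usual validator shape (a `valid_value?` predicate plus an `error_message`). The body here is a sketch under that assumption; the validator internals are not shown in this diff, and the i18n error key is hypothetical.

```ruby
# Sketch of one side of the validator pair, assuming Discourse's standard
# site-setting validator interface; the i18n key is hypothetical.
class SearchTokenizeChineseValidator
  def initialize(opts = {})
    @opts = opts
  end

  # Allow enabling Chinese tokenization only when Japanese tokenization is off.
  def valid_value?(value)
    return true if value == "f" # disabling is always allowed
    !SiteSetting.search_tokenize_japanese
  end

  def error_message
    I18n.t("site_settings.errors.search_tokenize_japanese_enabled")
  end
end
```

The Japanese-side validator would presumably mirror this, checking `SiteSetting.search_tokenize_chinese` instead, which is exactly what the two specs below verify from the outside.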
Alan Guo Xiang Tan
2022-01-26 15:24:11 +08:00
parent 9ddd1f739e
commit 930f51e175
14 changed files with 406 additions and 72 deletions


@@ -0,0 +1,11 @@
# frozen_string_literal: true

require 'rails_helper'

describe SearchTokenizeChineseValidator do
  it 'does not allow search_tokenize_chinese to be enabled when search_tokenize_japanese is enabled' do
    SiteSetting.search_tokenize_japanese = true

    expect { SiteSetting.search_tokenize_chinese = true }.to raise_error(Discourse::InvalidParameters)
  end
end


@@ -0,0 +1,11 @@
# frozen_string_literal: true

require 'rails_helper'

describe SearchTokenizeJapaneseValidator do
  it 'does not allow search_tokenize_japanese to be enabled when search_tokenize_chinese is enabled' do
    SiteSetting.search_tokenize_chinese = true

    expect { SiteSetting.search_tokenize_japanese = true }.to raise_error(Discourse::InvalidParameters)
  end
end