diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 490c983dc13..cefe57911f4 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -1486,7 +1486,8 @@ en: min_personal_message_title_length: "Minimum allowed title length for a message in characters" max_emojis_in_title: "Maximum allowed emojis in topic title" min_search_term_length: "Minimum valid search term length in characters" - search_tokenize_chinese_japanese_korean: "Force search to tokenize Chinese/Japanese/Korean even on non CJK sites" + search_tokenize_chinese: "Force search to tokenize Chinese even on non Chinese sites" + search_tokenize_japanese: "Force search to tokenize Japanese even on non Japanese sites" search_prefer_recent_posts: "If searching your large forum is slow, this option tries an index of more recent posts first" search_recent_posts_size: "How many recent posts to keep in the index" log_search_queries: "Log search queries performed by users" @@ -1629,7 +1630,7 @@ en: allowed_iframes: "A list of iframe src domain prefixes that discourse can safely allow in posts" allowed_crawler_user_agents: "User agents of web crawlers that should be allowed to access the site. WARNING! SETTING THIS WILL DISALLOW ALL CRAWLERS NOT LISTED HERE!" blocked_crawler_user_agents: "Unique case insensitive word in the user agent string identifying web crawlers that should not be allowed to access the site. Does not apply if allowlist is defined." - slow_down_crawler_user_agents: "User agents of web crawlers that should be rate limited as configured in the \"slow down crawler rate\" setting. Each value must be at least 3 characters long." + slow_down_crawler_user_agents: 'User agents of web crawlers that should be rate limited as configured in the "slow down crawler rate" setting. Each value must be at least 3 characters long.' 
slow_down_crawler_rate: "If slow_down_crawler_user_agents is specified this rate will apply to all the crawlers (number of seconds delay between requests)" content_security_policy: "Enable Content-Security-Policy" content_security_policy_report_only: "Enable Content-Security-Policy-Report-Only" @@ -2396,6 +2397,8 @@ en: unicode_usernames_avatars: "The internal system avatars do not support Unicode usernames." list_value_count: "The list must contain exactly %{count} values." google_oauth2_hd_groups: "You must first set 'google oauth2 hd' before enabling this setting." + search_tokenize_chinese_enabled: "You must disable 'search_tokenize_chinese' before enabling this setting." + search_tokenize_japanese_enabled: "You must disable 'search_tokenize_japanese' before enabling this setting." placeholder: discourse_connect_provider_secrets: diff --git a/config/site_settings.yml b/config/site_settings.yml index a84fd9d5889..9ca6bde4e7f 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -2034,7 +2034,12 @@ search: zh_TW: 1 ko: 1 ja: 1 - search_tokenize_chinese_japanese_korean: false + search_tokenize_chinese: + default: false + validator: "SearchTokenizeChineseValidator" + search_tokenize_japanese: + default: false + validator: "SearchTokenizeJapaneseValidator" search_prefer_recent_posts: false search_recent_posts_size: default: 1000000 diff --git a/db/migrate/20220126052157_change_segment_cjk_site_setting.rb b/db/migrate/20220126052157_change_segment_cjk_site_setting.rb new file mode 100644 index 00000000000..4552c87b8b5 --- /dev/null +++ b/db/migrate/20220126052157_change_segment_cjk_site_setting.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +class ChangeSegmentCjkSiteSetting < ActiveRecord::Migration[6.1] + def up + execute <<~SQL + UPDATE site_settings + SET name = 'search_tokenize_chinese' + WHERE name = 'search_tokenize_chinese_japanese_korean' + SQL + + execute <<~SQL + DELETE FROM site_settings + WHERE name = 
'search_tokenize_chinese_japanese_korean' + SQL + end + + def down + raise ActiveRecord::IrreversibleMigration + end +end diff --git a/lib/search.rb b/lib/search.rb index 4a3ab96da1e..e9a9d30bd58 100644 --- a/lib/search.rb +++ b/lib/search.rb @@ -64,9 +64,17 @@ class Search end end - def self.segment_cjk? - ['zh_TW', 'zh_CN', 'ja'].include?(SiteSetting.default_locale) || - SiteSetting.search_tokenize_chinese_japanese_korean + def self.segment_chinese? + ['zh_TW', 'zh_CN'].include?(SiteSetting.default_locale) || SiteSetting.search_tokenize_chinese + end + + def self.segment_japanese? + SiteSetting.default_locale == "ja" || SiteSetting.search_tokenize_japanese + end + + def self.japanese_punctuation_regexp + # Regexp adapted from https://github.com/6/tiny_segmenter/blob/15a5b825993dfd2c662df3766f232051716bef5b/lib/tiny_segmenter.rb#L7 + @japanese_punctuation_regexp ||= Regexp.compile("[-–—―.。・()()[]{}{}【】⟨⟩、、,,،…‥〽「」『』〜~!!::??\"'|__“”‘’;/⁄/«»]") end def self.prepare_data(search_data, purpose = nil) @@ -74,22 +82,35 @@ class Search data.force_encoding("UTF-8") if purpose != :topic - # TODO cppjieba_rb is designed for chinese, we need something else for Japanese - # Korean appears to be safe cause words are already space separated - # For Japanese we should investigate using kakasi - if segment_cjk? + if segment_chinese? require 'cppjieba_rb' unless defined? CppjiebaRb - data = CppjiebaRb.segment(search_data, mode: :mix) - # TODO: we still want to tokenize here but the current stopword list is too wide - # in cppjieba leading to words such as volume to be skipped. PG already has an English - # stopword list so use that vs relying on cppjieba - if ts_config != 'english' - data = CppjiebaRb.filter_stop_word(data) - else - data = data.filter { |s| s.present? } + segmented_data = [] + + # We need to split up the string here because Cppjieba has a bug where text starting with numeric chars will + # be split into two segments. 
For example, '123abc' becomes '123' and 'abc' after segmentation. + data.scan(/(?<chinese>[\p{Han}。,、“”《》…\.:?!;()]+)|([^\p{Han}]+)/) do + match_data = $LAST_MATCH_INFO + + if match_data[:chinese] + segments = CppjiebaRb.segment(match_data.to_s, mode: :mix) + + if ts_config != 'english' + segments = CppjiebaRb.filter_stop_word(segments) + end + + segments = segments.filter { |s| s.present? } + segmented_data << segments.join(' ') + else + segmented_data << match_data.to_s.squish + end end + data = segmented_data.join(' ') + elsif segment_japanese? + data.gsub!(japanese_punctuation_regexp, " ") + data = TinyJapaneseSegmenter.segment(data) + data = data.filter { |s| s.present? } data = data.join(' ') else data.squish! @@ -263,7 +284,7 @@ class Search end unless @filters.present? || @opts[:search_for_id] - min_length = @opts[:min_search_term_length] || SiteSetting.min_search_term_length + min_length = min_search_term_length terms = (@term || '').split(/\s(?=(?:[^"]|"[^"]*")*$)/).reject { |t| t.length < min_length } if terms.blank? 
@@ -571,7 +592,7 @@ class Search SQL # a bit yucky but we got to add the term back in - elsif match.to_s.length >= SiteSetting.min_search_term_length + elsif match.to_s.length >= min_search_term_length posts.where <<~SQL posts.id IN ( SELECT post_id FROM post_search_data pd1 @@ -1304,4 +1325,18 @@ class Search !readonly_mode && @opts[:type_filter] != "exclude_topics" end + + def min_search_term_length + return @opts[:min_search_term_length] if @opts[:min_search_term_length] + + if SiteSetting.search_tokenize_chinese + return SiteSetting.defaults.get('min_search_term_length', 'zh_CN') + end + + if SiteSetting.search_tokenize_japanese + return SiteSetting.defaults.get('min_search_term_length', 'ja') + end + + SiteSetting.min_search_term_length + end end diff --git a/lib/search/grouped_search_results.rb b/lib/search/grouped_search_results.rb index 8182fe90bc5..611ee168e8e 100644 --- a/lib/search/grouped_search_results.rb +++ b/lib/search/grouped_search_results.rb @@ -87,7 +87,7 @@ class Search blurb_length: @blurb_length } - if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION && !Search.segment_cjk? + if post.post_search_data.version > SearchIndexer::MIN_POST_REINDEX_VERSION && !Search.segment_chinese? && !Search.segment_japanese? if SiteSetting.use_pg_headlines_for_excerpt scrubbed_headline = post.headline.gsub(SCRUB_HEADLINE_REGEXP, '\1') prefix_omission = scrubbed_headline.start_with?(post.leading_raw_data) ? 
'' : OMISSION diff --git a/lib/site_settings/deprecated_settings.rb b/lib/site_settings/deprecated_settings.rb index 9e13541b354..36c0dfd55c5 100644 --- a/lib/site_settings/deprecated_settings.rb +++ b/lib/site_settings/deprecated_settings.rb @@ -31,6 +31,7 @@ module SiteSettings::DeprecatedSettings ['sso_overrides_card_background', 'discourse_connect_overrides_card_background', true, '2.8'], ['external_auth_skip_create_confirm', 'auth_skip_create_confirm', true, '2.8'], ['external_auth_immediately', 'auth_immediately', true, '2.8'], + ['search_tokenize_chinese_japanese_korean', 'search_tokenize_chinese', true, '2.9'], ] def setup_deprecated_methods diff --git a/lib/tiny_japanese_segmenter.rb b/lib/tiny_japanese_segmenter.rb new file mode 100644 index 00000000000..5da3175a36f --- /dev/null +++ b/lib/tiny_japanese_segmenter.rb @@ -0,0 +1,173 @@ +# frozen_string_literal: true + +# Ruby port of http://chasen.org/~taku/software/TinySegmenter/tiny_segmenter-0.2.js +# This is essentially a trained machine learning model used to segment words in Japanese. +# Discourse core uses it for "best effort" segmentation of Japanese text for search. 
+class TinyJapaneseSegmenter + CHARTYPE = { + "[一二三四五六七八九十百千万億兆]" => "M", + "[一-龠々〆ヵヶ]" => "H", + "[ぁ-ん]" => "I", + "[ァ-ヴーア-ン゙ー]" => "K", + "[a-zA-Za-zA-Z]" => "A", + "[0-90-9]" => "N" + }.map do |pattern, value| + [Regexp.compile(pattern), value] + end + + BIAS = -322 + BC1 = { "HH" => 6, "II" => 2461, "KH" => 406, "OH" => -1378 } + BC2 = { "AA" => -3267, "AI" => 2744, "AN" => -878, "HH" => -4070, "HM" => -1711, "HN" => 4012, "HO" => 3761, "IA" => 1327, "IH" => -1184, "II" => -1332, "IK" => 1721, "IO" => 5492, "KI" => 3831, "KK" => -8741, "MH" => -3132, "MK" => 3334, "OO" => -2920 } + BC3 = { "HH" => 996, "HI" => 626, "HK" => -721, "HN" => -1307, "HO" => -836, "IH" => -301, "KK" => 2762, "MK" => 1079, "MM" => 4034, "OA" => -1652, "OH" => 266 } + BP1 = { "BB" => 295, "OB" => 304, "OO" => -125, "UB" => 352 } + BP2 = { "BO" => 60, "OO" => -1762 } + BQ1 = { "BHH" => 1150, "BHM" => 1521, "BII" => -1158, "BIM" => 886, "BMH" => 1208, "BNH" => 449, "BOH" => -91, "BOO" => -2597, "OHI" => 451, "OIH" => -296, "OKA" => 1851, "OKH" => -1020, "OKK" => 904, "OOO" => 2965 } + BQ2 = { "BHH" => 118, "BHI" => -1159, "BHM" => 466, "BIH" => -919, "BKK" => -1720, "BKO" => 864, "OHH" => -1139, "OHM" => -181, "OIH" => 153, "UHI" => -1146 } + BQ3 = { "BHH" => -792, "BHI" => 2664, "BII" => -299, "BKI" => 419, "BMH" => 937, "BMM" => 8335, "BNN" => 998, "BOH" => 775, "OHH" => 2174, "OHM" => 439, "OII" => 280, "OKH" => 1798, "OKI" => -793, "OKO" => -2242, "OMH" => -2402, "OOO" => 11699 } + BQ4 = { "BHH" => -3895, "BIH" => 3761, "BII" => -4654, "BIK" => 1348, "BKK" => -1806, "BMI" => -3385, "BOO" => -12396, "OAH" => 926, "OHH" => 266, "OHK" => -2036, "ONN" => -973 } + BW1 = { ",と" => 660, ",同" => 727, "B1あ" => 1404, "B1同" => 542, "、と" => 660, "、同" => 727, "」と" => 1682, "あっ" => 1505, "いう" => 1743, "いっ" => -2055, "いる" => 672, "うし" => -4817, "うん" => 665, "から" => 3472, "がら" => 600, "こう" => -790, "こと" => 2083, "こん" => -1262, "さら" => -4143, "さん" => 4573, "した" => 2641, "して" => 1104, "すで" => -3399, 
"そこ" => 1977, "それ" => -871, "たち" => 1122, "ため" => 601, "った" => 3463, "つい" => -802, "てい" => 805, "てき" => 1249, "でき" => 1127, "です" => 3445, "では" => 844, "とい" => -4915, "とみ" => 1922, "どこ" => 3887, "ない" => 5713, "なっ" => 3015, "など" => 7379, "なん" => -1113, "にし" => 2468, "には" => 1498, "にも" => 1671, "に対" => -912, "の一" => -501, "の中" => 741, "ませ" => 2448, "まで" => 1711, "まま" => 2600, "まる" => -2155, "やむ" => -1947, "よっ" => -2565, "れた" => 2369, "れで" => -913, "をし" => 1860, "を見" => 731, "亡く" => -1886, "京都" => 2558, "取り" => -2784, "大き" => -2604, "大阪" => 1497, "平方" => -2314, "引き" => -1336, "日本" => -195, "本当" => -2423, "毎日" => -2113, "目指" => -724, "B1あ" => 1404, "B1同" => 542, "」と" => 1682 } + BW2 = { ".." => -11822, "11" => -669, "――" => -5730, "−−" => -13175, "いう" => -1609, "うか" => 2490, "かし" => -1350, "かも" => -602, "から" => -7194, "かれ" => 4612, "がい" => 853, "がら" => -3198, "きた" => 1941, "くな" => -1597, "こと" => -8392, "この" => -4193, "させ" => 4533, "され" => 13168, "さん" => -3977, "しい" => -1819, "しか" => -545, "した" => 5078, "して" => 972, "しな" => 939, "その" => -3744, "たい" => -1253, "たた" => -662, "ただ" => -3857, "たち" => -786, "たと" => 1224, "たは" => -939, "った" => 4589, "って" => 1647, "っと" => -2094, "てい" => 6144, "てき" => 3640, "てく" => 2551, "ては" => -3110, "ても" => -3065, "でい" => 2666, "でき" => -1528, "でし" => -3828, "です" => -4761, "でも" => -4203, "とい" => 1890, "とこ" => -1746, "とと" => -2279, "との" => 720, "とみ" => 5168, "とも" => -3941, "ない" => -2488, "なが" => -1313, "など" => -6509, "なの" => 2614, "なん" => 3099, "にお" => -1615, "にし" => 2748, "にな" => 2454, "によ" => -7236, "に対" => -14943, "に従" => -4688, "に関" => -11388, "のか" => 2093, "ので" => -7059, "のに" => -6041, "のの" => -6125, "はい" => 1073, "はが" => -1033, "はず" => -2532, "ばれ" => 1813, "まし" => -1316, "まで" => -6621, "まれ" => 5409, "めて" => -3153, "もい" => 2230, "もの" => -10713, "らか" => -944, "らし" => -1611, "らに" => -1897, "りし" => 651, "りま" => 1620, "れた" => 4270, "れて" => 849, "れば" => 4114, "ろう" => 6067, "われ" => 7901, "を通" => -11877, "んだ" => 728, "んな" => -4115, "一人" => 602, 
"一方" => -1375, "一日" => 970, "一部" => -1051, "上が" => -4479, "会社" => -1116, "出て" => 2163, "分の" => -7758, "同党" => 970, "同日" => -913, "大阪" => -2471, "委員" => -1250, "少な" => -1050, "年度" => -8669, "年間" => -1626, "府県" => -2363, "手権" => -1982, "新聞" => -4066, "日新" => -722, "日本" => -7068, "日米" => 3372, "曜日" => -601, "朝鮮" => -2355, "本人" => -2697, "東京" => -1543, "然と" => -1384, "社会" => -1276, "立て" => -990, "第に" => -1612, "米国" => -4268, "11" => -669 } + BW3 = { "あた" => -2194, "あり" => 719, "ある" => 3846, "い." => -1185, "い。" => -1185, "いい" => 5308, "いえ" => 2079, "いく" => 3029, "いた" => 2056, "いっ" => 1883, "いる" => 5600, "いわ" => 1527, "うち" => 1117, "うと" => 4798, "えと" => 1454, "か." => 2857, "か。" => 2857, "かけ" => -743, "かっ" => -4098, "かに" => -669, "から" => 6520, "かり" => -2670, "が," => 1816, "が、" => 1816, "がき" => -4855, "がけ" => -1127, "がっ" => -913, "がら" => -4977, "がり" => -2064, "きた" => 1645, "けど" => 1374, "こと" => 7397, "この" => 1542, "ころ" => -2757, "さい" => -714, "さを" => 976, "し," => 1557, "し、" => 1557, "しい" => -3714, "した" => 3562, "して" => 1449, "しな" => 2608, "しま" => 1200, "す." => -1310, "す。" => -1310, "する" => 6521, "ず," => 3426, "ず、" => 3426, "ずに" => 841, "そう" => 428, "た." => 8875, "た。" => 8875, "たい" => -594, "たの" => 812, "たり" => -1183, "たる" => -853, "だ." 
=> 4098, "だ。" => 4098, "だっ" => 1004, "った" => -4748, "って" => 300, "てい" => 6240, "てお" => 855, "ても" => 302, "です" => 1437, "でに" => -1482, "では" => 2295, "とう" => -1387, "とし" => 2266, "との" => 541, "とも" => -3543, "どう" => 4664, "ない" => 1796, "なく" => -903, "など" => 2135, "に," => -1021, "に、" => -1021, "にし" => 1771, "にな" => 1906, "には" => 2644, "の," => -724, "の、" => -724, "の子" => -1000, "は," => 1337, "は、" => 1337, "べき" => 2181, "まし" => 1113, "ます" => 6943, "まっ" => -1549, "まで" => 6154, "まれ" => -793, "らし" => 1479, "られ" => 6820, "るる" => 3818, "れ," => 854, "れ、" => 854, "れた" => 1850, "れて" => 1375, "れば" => -3246, "れる" => 1091, "われ" => -605, "んだ" => 606, "んで" => 798, "カ月" => 990, "会議" => 860, "入り" => 1232, "大会" => 2217, "始め" => 1681, "市" => 965, "新聞" => -5055, "日," => 974, "日、" => 974, "社会" => 2024, "カ月" => 990 } + TC1 = { "AAA" => 1093, "HHH" => 1029, "HHM" => 580, "HII" => 998, "HOH" => -390, "HOM" => -331, "IHI" => 1169, "IOH" => -142, "IOI" => -1015, "IOM" => 467, "MMH" => 187, "OOI" => -1832 } + TC2 = { "HHO" => 2088, "HII" => -1023, "HMM" => -1154, "IHI" => -1965, "KKH" => 703, "OII" => -2649 } + TC3 = { "AAA" => -294, "HHH" => 346, "HHI" => -341, "HII" => -1088, "HIK" => 731, "HOH" => -1486, "IHH" => 128, "IHI" => -3041, "IHO" => -1935, "IIH" => -825, "IIM" => -1035, "IOI" => -542, "KHH" => -1216, "KKA" => 491, "KKH" => -1217, "KOK" => -1009, "MHH" => -2694, "MHM" => -457, "MHO" => 123, "MMH" => -471, "NNH" => -1689, "NNO" => 662, "OHO" => -3393 } + TC4 = { "HHH" => -203, "HHI" => 1344, "HHK" => 365, "HHM" => -122, "HHN" => 182, "HHO" => 669, "HIH" => 804, "HII" => 679, "HOH" => 446, "IHH" => 695, "IHO" => -2324, "IIH" => 321, "III" => 1497, "IIO" => 656, "IOO" => 54, "KAK" => 4845, "KKA" => 3386, "KKK" => 3065, "MHH" => -405, "MHI" => 201, "MMH" => -241, "MMM" => 661, "MOM" => 841 } + TQ1 = { "BHHH" => -227, "BHHI" => 316, "BHIH" => -132, "BIHH" => 60, "BIII" => 1595, "BNHH" => -744, "BOHH" => 225, "BOOO" => -908, "OAKK" => 482, "OHHH" => 281, "OHIH" => 249, "OIHI" => 200, 
"OIIH" => -68 } + TQ2 = { "BIHH" => -1401, "BIII" => -1033, "BKAK" => -543, "BOOO" => -5591 } + TQ3 = { "BHHH" => 478, "BHHM" => -1073, "BHIH" => 222, "BHII" => -504, "BIIH" => -116, "BIII" => -105, "BMHI" => -863, "BMHM" => -464, "BOMH" => 620, "OHHH" => 346, "OHHI" => 1729, "OHII" => 997, "OHMH" => 481, "OIHH" => 623, "OIIH" => 1344, "OKAK" => 2792, "OKHH" => 587, "OKKA" => 679, "OOHH" => 110, "OOII" => -685 } + TQ4 = { "BHHH" => -721, "BHHM" => -3604, "BHII" => -966, "BIIH" => -607, "BIII" => -2181, "OAAA" => -2763, "OAKK" => 180, "OHHH" => -294, "OHHI" => 2446, "OHHO" => 480, "OHIH" => -1573, "OIHH" => 1935, "OIHI" => -493, "OIIH" => 626, "OIII" => -4007, "OKAK" => -8156 } + TW1 = { "につい" => -4681, "東京都" => 2026 } + TW2 = { "ある程" => -2049, "いった" => -1256, "ころが" => -2434, "しょう" => 3873, "その後" => -4430, "だって" => -1049, "ていた" => 1833, "として" => -4657, "ともに" => -4517, "もので" => 1882, "一気に" => -792, "初めて" => -1512, "同時に" => -8097, "大きな" => -1255, "対して" => -2721, "社会党" => -3216 } + TW3 = { "いただ" => -1734, "してい" => 1314, "として" => -4314, "につい" => -5483, "にとっ" => -5989, "に当た" => -6247, "ので," => -727, "ので、" => -727, "のもの" => -600, "れから" => -3752, "十二月" => -2287 } + TW4 = { "いう." 
=> 8576, "いう。" => 8576, "からな" => -2348, "してい" => 2958, "たが," => 1516, "たが、" => 1516, "ている" => 1538, "という" => 1349, "ました" => 5543, "ません" => 1097, "ようと" => -4258, "よると" => 5865 } + UC1 = { "A" => 484, "K" => 93, "M" => 645, "O" => -505 } + UC2 = { "A" => 819, "H" => 1059, "I" => 409, "M" => 3987, "N" => 5775, "O" => 646 } + UC3 = { "A" => -1370, "I" => 2311 } + UC4 = { "A" => -2643, "H" => 1809, "I" => -1032, "K" => -3450, "M" => 3565, "N" => 3876, "O" => 6646 } + UC5 = { "H" => 313, "I" => -1238, "K" => -799, "M" => 539, "O" => -831 } + UC6 = { "H" => -506, "I" => -253, "K" => 87, "M" => 247, "O" => -387 } + UP1 = { "O" => -214 } + UP2 = { "B" => 69, "O" => 935 } + UP3 = { "B" => 189 } + UQ1 = { "BH" => 21, "BI" => -12, "BK" => -99, "BN" => 142, "BO" => -56, "OH" => -95, "OI" => 477, "OK" => 410, "OO" => -2422 } + UQ2 = { "BH" => 216, "BI" => 113, "OK" => 1759 } + UQ3 = { "BA" => -479, "BH" => 42, "BI" => 1913, "BK" => -7198, "BM" => 3160, "BN" => 6427, "BO" => 14761, "OI" => -827, "ON" => -3212 } + UW1 = { "," => 156, "、" => 156, "「" => -463, "あ" => -941, "う" => -127, "が" => -553, "き" => 121, "こ" => 505, "で" => -201, "と" => -547, "ど" => -123, "に" => -789, "の" => -185, "は" => -847, "も" => -466, "や" => -470, "よ" => 182, "ら" => -292, "り" => 208, "れ" => 169, "を" => -446, "ん" => -137, "・" => -135, "主" => -402, "京" => -268, "区" => -912, "午" => 871, "国" => -460, "大" => 561, "委" => 729, "市" => -411, "日" => -141, "理" => 361, "生" => -408, "県" => -386, "都" => -718, "「" => -463, "・" => -135 } + UW2 = { "," => -829, "、" => -829, "〇" => 892, "「" => -645, "」" => 3145, "あ" => -538, "い" => 505, "う" => 134, "お" => -502, "か" => 1454, "が" => -856, "く" => -412, "こ" => 1141, "さ" => 878, "ざ" => 540, "し" => 1529, "す" => -675, "せ" => 300, "そ" => -1011, "た" => 188, "だ" => 1837, "つ" => -949, "て" => -291, "で" => -268, "と" => -981, "ど" => 1273, "な" => 1063, "に" => -1764, "の" => 130, "は" => -409, "ひ" => -1273, "べ" => 1261, "ま" => 600, "も" => -1263, "や" => -402, "よ" => 1639, "り" => -579, "る" => 
-694, "れ" => 571, "を" => -2516, "ん" => 2095, "ア" => -587, "カ" => 306, "キ" => 568, "ッ" => 831, "三" => -758, "不" => -2150, "世" => -302, "中" => -968, "主" => -861, "事" => 492, "人" => -123, "会" => 978, "保" => 362, "入" => 548, "初" => -3025, "副" => -1566, "北" => -3414, "区" => -422, "大" => -1769, "天" => -865, "太" => -483, "子" => -1519, "学" => 760, "実" => 1023, "小" => -2009, "市" => -813, "年" => -1060, "強" => 1067, "手" => -1519, "揺" => -1033, "政" => 1522, "文" => -1355, "新" => -1682, "日" => -1815, "明" => -1462, "最" => -630, "朝" => -1843, "本" => -1650, "東" => -931, "果" => -665, "次" => -2378, "民" => -180, "気" => -1740, "理" => 752, "発" => 529, "目" => -1584, "相" => -242, "県" => -1165, "立" => -763, "第" => 810, "米" => 509, "自" => -1353, "行" => 838, "西" => -744, "見" => -3874, "調" => 1010, "議" => 1198, "込" => 3041, "開" => 1758, "間" => -1257, "「" => -645, "」" => 3145, "ッ" => 831, "ア" => -587, "カ" => 306, "キ" => 568 } + UW3 = { "," => 4889, "1" => -800, "−" => -1723, "、" => 4889, "々" => -2311, "〇" => 5827, "」" => 2670, "〓" => -3573, "あ" => -2696, "い" => 1006, "う" => 2342, "え" => 1983, "お" => -4864, "か" => -1163, "が" => 3271, "く" => 1004, "け" => 388, "げ" => 401, "こ" => -3552, "ご" => -3116, "さ" => -1058, "し" => -395, "す" => 584, "せ" => 3685, "そ" => -5228, "た" => 842, "ち" => -521, "っ" => -1444, "つ" => -1081, "て" => 6167, "で" => 2318, "と" => 1691, "ど" => -899, "な" => -2788, "に" => 2745, "の" => 4056, "は" => 4555, "ひ" => -2171, "ふ" => -1798, "へ" => 1199, "ほ" => -5516, "ま" => -4384, "み" => -120, "め" => 1205, "も" => 2323, "や" => -788, "よ" => -202, "ら" => 727, "り" => 649, "る" => 5905, "れ" => 2773, "わ" => -1207, "を" => 6620, "ん" => -518, "ア" => 551, "グ" => 1319, "ス" => 874, "ッ" => -1350, "ト" => 521, "ム" => 1109, "ル" => 1591, "ロ" => 2201, "ン" => 278, "・" => -3794, "一" => -1619, "下" => -1759, "世" => -2087, "両" => 3815, "中" => 653, "主" => -758, "予" => -1193, "二" => 974, "人" => 2742, "今" => 792, "他" => 1889, "以" => -1368, "低" => 811, "何" => 4265, "作" => -361, "保" => -2439, "元" => 4858, "党" => 3593, 
"全" => 1574, "公" => -3030, "六" => 755, "共" => -1880, "円" => 5807, "再" => 3095, "分" => 457, "初" => 2475, "別" => 1129, "前" => 2286, "副" => 4437, "力" => 365, "動" => -949, "務" => -1872, "化" => 1327, "北" => -1038, "区" => 4646, "千" => -2309, "午" => -783, "協" => -1006, "口" => 483, "右" => 1233, "各" => 3588, "合" => -241, "同" => 3906, "和" => -837, "員" => 4513, "国" => 642, "型" => 1389, "場" => 1219, "外" => -241, "妻" => 2016, "学" => -1356, "安" => -423, "実" => -1008, "家" => 1078, "小" => -513, "少" => -3102, "州" => 1155, "市" => 3197, "平" => -1804, "年" => 2416, "広" => -1030, "府" => 1605, "度" => 1452, "建" => -2352, "当" => -3885, "得" => 1905, "思" => -1291, "性" => 1822, "戸" => -488, "指" => -3973, "政" => -2013, "教" => -1479, "数" => 3222, "文" => -1489, "新" => 1764, "日" => 2099, "旧" => 5792, "昨" => -661, "時" => -1248, "曜" => -951, "最" => -937, "月" => 4125, "期" => 360, "李" => 3094, "村" => 364, "東" => -805, "核" => 5156, "森" => 2438, "業" => 484, "氏" => 2613, "民" => -1694, "決" => -1073, "法" => 1868, "海" => -495, "無" => 979, "物" => 461, "特" => -3850, "生" => -273, "用" => 914, "町" => 1215, "的" => 7313, "直" => -1835, "省" => 792, "県" => 6293, "知" => -1528, "私" => 4231, "税" => 401, "立" => -960, "第" => 1201, "米" => 7767, "系" => 3066, "約" => 3663, "級" => 1384, "統" => -4229, "総" => 1163, "線" => 1255, "者" => 6457, "能" => 725, "自" => -2869, "英" => 785, "見" => 1044, "調" => -562, "財" => -733, "費" => 1777, "車" => 1835, "軍" => 1375, "込" => -1504, "通" => -1136, "選" => -681, "郎" => 1026, "郡" => 4404, "部" => 1200, "金" => 2163, "長" => 421, "開" => -1432, "間" => 1302, "関" => -1282, "雨" => 2009, "電" => -1045, "非" => 2066, "駅" => 1620, "1" => -800, "」" => 2670, "・" => -3794, "ッ" => -1350, "ア" => 551, "グ" => 1319, "ス" => 874, "ト" => 521, "ム" => 1109, "ル" => 1591, "ロ" => 2201, "ン" => 278 } + UW4 = { "," => 3930, "." 
=> 3508, "―" => -4841, "、" => 3930, "。" => 3508, "〇" => 4999, "「" => 1895, "」" => 3798, "〓" => -5156, "あ" => 4752, "い" => -3435, "う" => -640, "え" => -2514, "お" => 2405, "か" => 530, "が" => 6006, "き" => -4482, "ぎ" => -3821, "く" => -3788, "け" => -4376, "げ" => -4734, "こ" => 2255, "ご" => 1979, "さ" => 2864, "し" => -843, "じ" => -2506, "す" => -731, "ず" => 1251, "せ" => 181, "そ" => 4091, "た" => 5034, "だ" => 5408, "ち" => -3654, "っ" => -5882, "つ" => -1659, "て" => 3994, "で" => 7410, "と" => 4547, "な" => 5433, "に" => 6499, "ぬ" => 1853, "ね" => 1413, "の" => 7396, "は" => 8578, "ば" => 1940, "ひ" => 4249, "び" => -4134, "ふ" => 1345, "へ" => 6665, "べ" => -744, "ほ" => 1464, "ま" => 1051, "み" => -2082, "む" => -882, "め" => -5046, "も" => 4169, "ゃ" => -2666, "や" => 2795, "ょ" => -1544, "よ" => 3351, "ら" => -2922, "り" => -9726, "る" => -14896, "れ" => -2613, "ろ" => -4570, "わ" => -1783, "を" => 13150, "ん" => -2352, "カ" => 2145, "コ" => 1789, "セ" => 1287, "ッ" => -724, "ト" => -403, "メ" => -1635, "ラ" => -881, "リ" => -541, "ル" => -856, "ン" => -3637, "・" => -4371, "ー" => -11870, "一" => -2069, "中" => 2210, "予" => 782, "事" => -190, "井" => -1768, "人" => 1036, "以" => 544, "会" => 950, "体" => -1286, "作" => 530, "側" => 4292, "先" => 601, "党" => -2006, "共" => -1212, "内" => 584, "円" => 788, "初" => 1347, "前" => 1623, "副" => 3879, "力" => -302, "動" => -740, "務" => -2715, "化" => 776, "区" => 4517, "協" => 1013, "参" => 1555, "合" => -1834, "和" => -681, "員" => -910, "器" => -851, "回" => 1500, "国" => -619, "園" => -1200, "地" => 866, "場" => -1410, "塁" => -2094, "士" => -1413, "多" => 1067, "大" => 571, "子" => -4802, "学" => -1397, "定" => -1057, "寺" => -809, "小" => 1910, "屋" => -1328, "山" => -1500, "島" => -2056, "川" => -2667, "市" => 2771, "年" => 374, "庁" => -4556, "後" => 456, "性" => 553, "感" => 916, "所" => -1566, "支" => 856, "改" => 787, "政" => 2182, "教" => 704, "文" => 522, "方" => -856, "日" => 1798, "時" => 1829, "最" => 845, "月" => -9066, "木" => -485, "来" => -442, "校" => -360, "業" => -1043, "氏" => 5388, "民" => -2716, "気" => -910, "沢" => 
-939, "済" => -543, "物" => -735, "率" => 672, "球" => -1267, "生" => -1286, "産" => -1101, "田" => -2900, "町" => 1826, "的" => 2586, "目" => 922, "省" => -3485, "県" => 2997, "空" => -867, "立" => -2112, "第" => 788, "米" => 2937, "系" => 786, "約" => 2171, "経" => 1146, "統" => -1169, "総" => 940, "線" => -994, "署" => 749, "者" => 2145, "能" => -730, "般" => -852, "行" => -792, "規" => 792, "警" => -1184, "議" => -244, "谷" => -1000, "賞" => 730, "車" => -1481, "軍" => 1158, "輪" => -1433, "込" => -3370, "近" => 929, "道" => -1291, "選" => 2596, "郎" => -4866, "都" => 1192, "野" => -1100, "銀" => -2213, "長" => 357, "間" => -2344, "院" => -2297, "際" => -2604, "電" => -878, "領" => -1659, "題" => -792, "館" => -1984, "首" => 1749, "高" => 2120, "「" => 1895, "」" => 3798, "・" => -4371, "ッ" => -724, "ー" => -11870, "カ" => 2145, "コ" => 1789, "セ" => 1287, "ト" => -403, "メ" => -1635, "ラ" => -881, "リ" => -541, "ル" => -856, "ン" => -3637 } + UW5 = { "," => 465, "." => -299, "1" => -514, "E2" => -32768, "]" => -2762, "、" => 465, "。" => -299, "「" => 363, "あ" => 1655, "い" => 331, "う" => -503, "え" => 1199, "お" => 527, "か" => 647, "が" => -421, "き" => 1624, "ぎ" => 1971, "く" => 312, "げ" => -983, "さ" => -1537, "し" => -1371, "す" => -852, "だ" => -1186, "ち" => 1093, "っ" => 52, "つ" => 921, "て" => -18, "で" => -850, "と" => -127, "ど" => 1682, "な" => -787, "に" => -1224, "の" => -635, "は" => -578, "べ" => 1001, "み" => 502, "め" => 865, "ゃ" => 3350, "ょ" => 854, "り" => -208, "る" => 429, "れ" => 504, "わ" => 419, "を" => -1264, "ん" => 327, "イ" => 241, "ル" => 451, "ン" => -343, "中" => -871, "京" => 722, "会" => -1153, "党" => -654, "務" => 3519, "区" => -901, "告" => 848, "員" => 2104, "大" => -1296, "学" => -548, "定" => 1785, "嵐" => -1304, "市" => -2991, "席" => 921, "年" => 1763, "思" => 872, "所" => -814, "挙" => 1618, "新" => -1682, "日" => 218, "月" => -4353, "査" => 932, "格" => 1356, "機" => -1508, "氏" => -1347, "田" => 240, "町" => -3912, "的" => -3149, "相" => 1319, "省" => -1052, "県" => -4003, "研" => -997, "社" => -278, "空" => -813, "統" => 1955, "者" => -2233, "表" => 
663, "語" => -1073, "議" => 1219, "選" => -1018, "郎" => -368, "長" => 786, "間" => 1191, "題" => 2368, "館" => -689, "1" => -514, "E2" => -32768, "「" => 363, "イ" => 241, "ル" => 451, "ン" => -343 } + UW6 = { "," => 227, "." => 808, "1" => -270, "E1" => 306, "、" => 227, "。" => 808, "あ" => -307, "う" => 189, "か" => 241, "が" => -73, "く" => -121, "こ" => -200, "じ" => 1782, "す" => 383, "た" => -428, "っ" => 573, "て" => -1014, "で" => 101, "と" => -105, "な" => -253, "に" => -149, "の" => -417, "は" => -236, "も" => -206, "り" => 187, "る" => -135, "を" => 195, "ル" => -673, "ン" => -496, "一" => -277, "中" => 201, "件" => -800, "会" => 624, "前" => 302, "区" => 1792, "員" => -1212, "委" => 798, "学" => -960, "市" => 887, "広" => -695, "後" => 535, "業" => -697, "相" => 753, "社" => -507, "福" => 974, "空" => -822, "者" => 1811, "連" => 463, "郎" => 1082, "1" => -270, "E1" => 306, "ル" => -673, "ン" => -496 } + + class << self + def segment(text) + return [] if text.nil? || text.strip.length == 0 + + result = [] + + segments = ["B3", "B2", "B1"] + ctypes = ["O", "O", "O"] + + text.chars.each do |char| + segments << char + ctypes << ctype(char) + end + + segments.concat(["E1", "E2", "E3"]) + ctypes.concat(["O", "O", "O"]) + + word = segments[3] + p1 = "U" + p2 = "U" + p3 = "U" + + 4.upto(segments.size - 4) do |i| + score = BIAS + w1 = segments[i - 3] + w2 = segments[i - 2] + w3 = segments[i - 1] + w4 = segments[i] + w5 = segments[i + 1] + w6 = segments[i + 2] + c1 = ctypes[i - 3] + c2 = ctypes[i - 2] + c3 = ctypes[i - 1] + c4 = ctypes[i] + c5 = ctypes[i + 1] + c6 = ctypes[i + 2] + score += UP1[p1].to_i + score += UP2[p2].to_i + score += UP3[p3].to_i + score += BP1[p1 + p2].to_i + score += BP2[p2 + p3].to_i + score += UW1[w1].to_i + score += UW2[w2].to_i + score += UW3[w3].to_i + score += UW4[w4].to_i + score += UW5[w5].to_i + score += UW6[w6].to_i + score += BW1[w2 + w3].to_i + score += BW2[w3 + w4].to_i + score += BW3[w4 + w5].to_i + score += TW1[w1 + w2 + w3].to_i + score += TW2[w2 + w3 + w4].to_i + score += TW3[w3 
+ w4 + w5].to_i + score += TW4[w4 + w5 + w6].to_i + score += UC1[c1].to_i + score += UC2[c2].to_i + score += UC3[c3].to_i + score += UC4[c4].to_i + score += UC5[c5].to_i + score += UC6[c6].to_i + score += BC1[c2 + c3].to_i + score += BC2[c3 + c4].to_i + score += BC3[c4 + c5].to_i + score += TC1[c1 + c2 + c3].to_i + score += TC2[c2 + c3 + c4].to_i + score += TC3[c3 + c4 + c5].to_i + score += TC4[c4 + c5 + c6].to_i + # score += TC5[c4 + c5 + c6].to_i + score += UQ1[p1 + c1].to_i + score += UQ2[p2 + c2].to_i + score += UQ3[p3 + c3].to_i + score += BQ1[p2 + c2 + c3].to_i + score += BQ2[p2 + c3 + c4].to_i + score += BQ3[p3 + c2 + c3].to_i + score += BQ4[p3 + c3 + c4].to_i + score += TQ1[p2 + c1 + c2 + c3].to_i + score += TQ2[p2 + c2 + c3 + c4].to_i + score += TQ3[p3 + c1 + c2 + c3].to_i + score += TQ4[p3 + c2 + c3 + c4].to_i + + p = "O" + + if score > 0 + result.push(word) + word = "" + p = "B" + end + + p1 = p2 + p2 = p3 + p3 = p + word += segments[i] + end + + result.push(word) + + result + end + + private + + def ctype(text) + CHARTYPE.each do |regexp, value| + if text.match(regexp) + return value + end + end + + "O" + end + end +end diff --git a/lib/validators/search_tokenize_chinese_validator.rb b/lib/validators/search_tokenize_chinese_validator.rb new file mode 100644 index 00000000000..0cda74ea10c --- /dev/null +++ b/lib/validators/search_tokenize_chinese_validator.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +class SearchTokenizeChineseValidator + def initialize(opts = {}) + end + + def valid_value?(value) + !SiteSetting.search_tokenize_japanese + end + + def error_message + I18n.t("site_settings.errors.search_tokenize_japanese_enabled") + end +end diff --git a/lib/validators/search_tokenize_japanese_validator.rb b/lib/validators/search_tokenize_japanese_validator.rb new file mode 100644 index 00000000000..7ad07a9c88b --- /dev/null +++ b/lib/validators/search_tokenize_japanese_validator.rb @@ -0,0 +1,14 @@ +# frozen_string_literal: true + +class 
SearchTokenizeJapaneseValidator + def initialize(opts = {}) + end + + def valid_value?(value) + !SiteSetting.search_tokenize_chinese + end + + def error_message + I18n.t("site_settings.errors.search_tokenize_chinese_enabled") + end +end diff --git a/spec/components/search_spec.rb b/spec/components/search_spec.rb index f47ca2438ea..cff820c6025 100644 --- a/spec/components/search_spec.rb +++ b/spec/components/search_spec.rb @@ -1099,26 +1099,85 @@ describe Search do end - describe 'Chinese search' do - let(:sentence) { 'Discourse中国的基础设施网络正在组装' } - let(:sentence_t) { 'Discourse太平山森林遊樂區' } + context 'Japanese search' do + let!(:topic) { Fabricate(:topic) } + let!(:post) { Fabricate(:post, topic: topic, raw: 'This is some japanese text 日本が大好きです。') } + let!(:topic_2) { Fabricate(:topic, title: '日本の話題、 more japanese text') } + let!(:post_2) { Fabricate(:post, topic: topic_2) } - it 'splits English / Chinese and filter out stop words' do + describe '.prepare_data' do + it 'removes punctuations' do + SiteSetting.search_tokenize_japanese = true + + expect(Search.prepare_data(post.raw)).to eq("This is some japanese text 日本 が 大好き です") + end + end + + describe '.execute' do + before do + @old_default = SiteSetting.defaults.get(:min_search_term_length) + SiteSetting.defaults.set_regardless_of_locale(:min_search_term_length, 1) + SiteSetting.refresh! + end + + after do + SiteSetting.defaults.set_regardless_of_locale(:min_search_term_length, @old_default) + SiteSetting.refresh! 
+ end + + it 'finds posts containing Japanese text if tokenization is forced' do + SiteSetting.search_tokenize_japanese = true + + expect(Search.execute('日本').posts.map(&:id)).to eq([post_2.id, post.id]) + expect(Search.execute('日').posts.map(&:id)).to eq([post_2.id, post.id]) + end + + it "find posts containing search term when site's locale is set to Japanese" do + SiteSetting.default_locale = 'ja' + + expect(Search.execute('日本').posts.map(&:id)).to eq([post_2.id, post.id]) + expect(Search.execute('日').posts.map(&:id)).to eq([post_2.id, post.id]) + end + + it 'does not include superfluous spaces in blurbs' do + SiteSetting.default_locale = 'ja' + + post.update!(raw: '場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠') + + results = Search.execute('ういかせ竹域', type_filter: 'topic') + + expect(results.posts.length).to eq(1) + expect(results.blurb(results.posts.first)).to include('ういかせ竹域') + end + end + end + + describe 'Chinese search' do + let(:sentence) { 'Discourse is a software company 中国的基础设施网络正在组装。' } + let(:sentence_t) { 'Discourse is a software company 太平山森林遊樂區。' } + + it 'splits English / Chinese and filter out Chinese stop words' do SiteSetting.default_locale = 'zh_CN' - data = Search.prepare_data(sentence).split(' ') - expect(data).to eq(["Discourse", "中国", "基础设施", "网络", "正在", "组装"]) + data = Search.prepare_data(sentence) + expect(data).to eq("Discourse is a software company 中国 基础设施 网络 正在 组装") end it 'splits for indexing and filter out stop words' do SiteSetting.default_locale = 'zh_CN' - data = Search.prepare_data(sentence, :index).split(' ') - expect(data).to eq(["Discourse", "中国", "基础设施", "网络", "正在", "组装"]) + data = Search.prepare_data(sentence, :index) + expect(data).to eq("Discourse is a software company 中国 基础设施 网络 正在 组装") end it 'splits English / Traditional Chinese and filter out stop words' do SiteSetting.default_locale = 'zh_TW' - data = Search.prepare_data(sentence_t).split(' ') - expect(data).to eq(["Discourse", "太平山", 
"森林", "遊樂區"]) + data = Search.prepare_data(sentence_t) + expect(data).to eq("Discourse is a software company 太平山 森林 遊樂區") + end + + it 'does not split strings beginning with numeric chars into different segments' do + SiteSetting.default_locale = 'zh_TW' + data = Search.prepare_data("#{sentence} 123abc") + expect(data).to eq("Discourse is a software company 中国 基础设施 网络 正在 组装 123abc") end it 'finds chinese topic based on title' do @@ -1126,6 +1185,7 @@ describe Search do SiteSetting.default_locale = 'zh_TW' SiteSetting.min_search_term_length = 1 + topic = Fabricate(:topic, title: 'My Title Discourse社區指南') post = Fabricate(:post, topic: topic) @@ -1136,14 +1196,23 @@ describe Search do it 'finds chinese topic based on title if tokenization is forced' do skip("skipped until pg app installs the db correctly") if RbConfig::CONFIG["arch"] =~ /darwin/ - SiteSetting.search_tokenize_chinese_japanese_korean = true - SiteSetting.min_search_term_length = 1 + begin + SiteSetting.search_tokenize_chinese = true + default_min_search_term_length = SiteSetting.defaults.get(:min_search_term_length) + SiteSetting.defaults.set_regardless_of_locale(:min_search_term_length, 1) + SiteSetting.refresh! - topic = Fabricate(:topic, title: 'My Title Discourse社區指南') - post = Fabricate(:post, topic: topic) + topic = Fabricate(:topic, title: 'My Title Discourse社區指南') + post = Fabricate(:post, topic: topic) - expect(Search.execute('社區指南').posts.first.id).to eq(post.id) - expect(Search.execute('指南').posts.first.id).to eq(post.id) + expect(Search.execute('社區指南').posts.first.id).to eq(post.id) + expect(Search.execute('指南').posts.first.id).to eq(post.id) + ensure + if default_min_search_term_length + SiteSetting.defaults.set_regardless_of_locale(:min_search_term_length, default_min_search_term_length) + SiteSetting.refresh! 
+ end + end end end @@ -1818,27 +1887,6 @@ describe Search do end end - context 'CJK segmentation' do - before do - SiteSetting.search_tokenize_chinese_japanese_korean = true - SiteSetting.min_search_term_length = 1 - end - - let!(:post1) do - Fabricate(:post, raw: '場サアマネ織企ういかせ竹域ヱイマ穂基ホ神3予読ずねいぱ松査ス禁多サウ提懸イふ引小43改こょドめ。深とつぐ主思料農ぞかル者杯検める活分えほづぼ白犠') - end - - it('does not include superfluous spaces in blurbs') do - - results = Search.execute('ういかせ竹域', type_filter: 'topic') - expect(results.posts.length).to eq(1) - - expect(results.blurb(results.posts.first)).to include('ういかせ竹域') - - end - - end - context 'include_diacritics' do before { SiteSetting.search_ignore_accents = false } let!(:post1) { Fabricate(:post, raw: 'สวัสดี Régis hello') } diff --git a/spec/components/validators/search_tokenize_chinese_validator_spec.rb b/spec/components/validators/search_tokenize_chinese_validator_spec.rb new file mode 100644 index 00000000000..e0378145aed --- /dev/null +++ b/spec/components/validators/search_tokenize_chinese_validator_spec.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe SearchTokenizeChineseValidator do + it 'does not allow search_tokenize_chinese to be enabled when search_tokenize_japanese is enabled' do + SiteSetting.search_tokenize_japanese = true + + expect { SiteSetting.search_tokenize_chinese = true }.to raise_error(Discourse::InvalidParameters) + end +end diff --git a/spec/components/validators/search_tokenize_japanese_validator_spec.rb b/spec/components/validators/search_tokenize_japanese_validator_spec.rb new file mode 100644 index 00000000000..dc65f4245aa --- /dev/null +++ b/spec/components/validators/search_tokenize_japanese_validator_spec.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe SearchTokenizeJapaneseValidator do + it 'does not allow search_tokenize_japanese to be enabled when search_tokenize_chinese is enabled' do + SiteSetting.search_tokenize_chinese = true + + expect { 
SiteSetting.search_tokenize_japanese = true }.to raise_error(Discourse::InvalidParameters) + end +end diff --git a/spec/lib/search_spec.rb b/spec/lib/search_spec.rb index 1d10264f630..bfdfa7009e6 100644 --- a/spec/lib/search_spec.rb +++ b/spec/lib/search_spec.rb @@ -4,20 +4,6 @@ require 'rails_helper' describe Search do - context "#prepare_data" do - it "does not remove English stop words in mixed mode" do - SiteSetting.search_tokenize_chinese_japanese_korean = true - - tokenized = Search.prepare_data("monkey 吃香蕉 in a loud volume") - expect(tokenized).to eq("monkey 吃 香蕉 in a loud volume") - - SiteSetting.default_locale = 'zh_CN' - - tokenized = Search.prepare_data("monkey 吃香蕉 in a loud volume") - expect(tokenized).to eq("monkey 吃 香蕉 loud") - end - end - context "#ts_config" do it "maps locales to correct Postgres dictionaries" do expect(Search.ts_config).to eq("english") diff --git a/spec/lib/tiny_japanese_segmenter_spec.rb b/spec/lib/tiny_japanese_segmenter_spec.rb new file mode 100644 index 00000000000..2fc1d853f1a --- /dev/null +++ b/spec/lib/tiny_japanese_segmenter_spec.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +require 'rails_helper' + +describe TinyJapaneseSegmenter do + describe '.segment' do + it 'generates the segments for a given japanese text' do + expect(TinyJapaneseSegmenter.segment("TinySegmenterはJavascriptだけ書かれた極めてコンパクトな日本語分かち書きソフトウェアです。")).to eq( + %w{TinySegmenter は Javascript だけ 書か れ た 極め て コンパクト な 日本 語分 かち 書き ソフトウェア です 。} + ) + end + end +end