[Feature](inverted index) add unicode parser for inverted index (#21035)
This commit is contained in:
Submodule be/src/clucene updated: 60f5eab7ac...103e88a8a3
@ -27,6 +27,8 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_
|
||||
return INVERTED_INDEX_PARSER_NONE;
|
||||
case InvertedIndexParserType::PARSER_STANDARD:
|
||||
return INVERTED_INDEX_PARSER_STANDARD;
|
||||
case InvertedIndexParserType::PARSER_UNICODE:
|
||||
return INVERTED_INDEX_PARSER_UNICODE;
|
||||
case InvertedIndexParserType::PARSER_ENGLISH:
|
||||
return INVERTED_INDEX_PARSER_ENGLISH;
|
||||
case InvertedIndexParserType::PARSER_CHINESE:
|
||||
@ -44,6 +46,8 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st
|
||||
return InvertedIndexParserType::PARSER_NONE;
|
||||
} else if (parser_str_lower == INVERTED_INDEX_PARSER_STANDARD) {
|
||||
return InvertedIndexParserType::PARSER_STANDARD;
|
||||
} else if (parser_str_lower == INVERTED_INDEX_PARSER_UNICODE) {
|
||||
return InvertedIndexParserType::PARSER_UNICODE;
|
||||
} else if (parser_str_lower == INVERTED_INDEX_PARSER_ENGLISH) {
|
||||
return InvertedIndexParserType::PARSER_ENGLISH;
|
||||
} else if (parser_str_lower == INVERTED_INDEX_PARSER_CHINESE) {
|
||||
@ -67,7 +71,7 @@ std::string get_parser_mode_string_from_properties(
|
||||
if (properties.find(INVERTED_INDEX_PARSER_MODE_KEY) != properties.end()) {
|
||||
return properties.at(INVERTED_INDEX_PARSER_MODE_KEY);
|
||||
} else {
|
||||
return INVERTED_INDEX_PARSER_FINE_GRANULARITY;
|
||||
return INVERTED_INDEX_PARSER_COARSE_GRANULARITY;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -29,6 +29,7 @@ enum class InvertedIndexParserType {
|
||||
PARSER_STANDARD = 2,
|
||||
PARSER_ENGLISH = 3,
|
||||
PARSER_CHINESE = 4,
|
||||
PARSER_UNICODE = 5,
|
||||
};
|
||||
|
||||
struct InvertedIndexCtx {
|
||||
@ -46,6 +47,7 @@ const std::string INVERTED_INDEX_PARSER_KEY = "parser";
|
||||
const std::string INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
|
||||
const std::string INVERTED_INDEX_PARSER_NONE = "none";
|
||||
const std::string INVERTED_INDEX_PARSER_STANDARD = "standard";
|
||||
const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode";
|
||||
const std::string INVERTED_INDEX_PARSER_ENGLISH = "english";
|
||||
const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese";
|
||||
|
||||
|
||||
@ -103,6 +103,11 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
|
||||
analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
|
||||
reader.reset(
|
||||
(new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str())));
|
||||
} else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
|
||||
analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
|
||||
reader.reset(new lucene::util::SimpleInputStreamReader(
|
||||
new lucene::util::AStringReader(value.c_str()),
|
||||
lucene::util::SimpleInputStreamReader::UTF8));
|
||||
} else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
|
||||
auto chinese_analyzer =
|
||||
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
|
||||
|
||||
@ -154,7 +154,8 @@ public:
|
||||
_doc = std::make_unique<lucene::document::Document>();
|
||||
_dir.reset(DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true));
|
||||
|
||||
if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
|
||||
if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
|
||||
_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
|
||||
_analyzer = std::make_unique<lucene::analysis::standard::StandardAnalyzer>();
|
||||
} else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
|
||||
_analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
|
||||
@ -163,10 +164,10 @@ public:
|
||||
chinese_analyzer->setLanguage(L"chinese");
|
||||
chinese_analyzer->initDict(config::inverted_index_dict_path);
|
||||
auto mode = get_parser_mode_string_from_properties(_index_meta->properties());
|
||||
if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
|
||||
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
|
||||
} else {
|
||||
if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
|
||||
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
|
||||
} else {
|
||||
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
|
||||
}
|
||||
_analyzer.reset(chinese_analyzer);
|
||||
} else {
|
||||
@ -222,6 +223,11 @@ public:
|
||||
if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH ||
|
||||
_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
|
||||
new_char_token_stream(field_value_data, field_value_size, _field);
|
||||
} else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
|
||||
auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
|
||||
new lucene::util::AStringReader(field_value_data, field_value_size),
|
||||
lucene::util::SimpleInputStreamReader::UTF8);
|
||||
_field->setValue(stringReader);
|
||||
} else {
|
||||
new_field_value(field_value_data, field_value_size, _field);
|
||||
}
|
||||
|
||||
@ -52,7 +52,7 @@ The features for inverted index is as follows:
|
||||
- add fulltext search on text(string, varchar, char) field
|
||||
- MATCH_ALL matches all keywords, MATCH_ANY matches any keywords
|
||||
- support fulltext on array of text field
|
||||
- support english and chinese word parser
|
||||
- support english, chinese and mixed unicode word parser
|
||||
- accelerate normal equal, range query, replacing bitmap index in the future
|
||||
- support =, !=, >, >=, <, <= on text, numeric, datetime types
|
||||
- support =, !=, >, >=, <, <= on array of text, numeric, datetime types
|
||||
@ -74,10 +74,12 @@ The features for inverted index is as follows:
|
||||
- missing stands for no parser, the whole field is considered to be a term
|
||||
- "english" stands for english parser
|
||||
- "chinese" stands for chinese parser
|
||||
- "unicode" stands for the mixed-type parser, suitable for text that mixes Chinese and English. It can segment email prefixes and suffixes, IP addresses, and mixed letter-and-digit strings, and it also segments Chinese characters into 1-grams.
|
||||
|
||||
- "parser_mode" is utilized to set the tokenizer/parser type for Chinese word segmentation.
|
||||
- in "fine_grained" mode, the system will meticulously tokenize each possible segment.
|
||||
- in "coarse_grained" mode, the system follows the maximization principle, performing accurate and comprehensive tokenization.
|
||||
- default mode is "fine_grained".
|
||||
- default mode is "coarse_grained".
|
||||
- "support_phrase" is utilized to specify if the index requires support for phrase mode.
|
||||
- "true" indicates that support is needed.
|
||||
- "false" indicates that support is not needed.
|
||||
@ -88,10 +90,10 @@ The features for inverted index is as follows:
|
||||
CREATE TABLE table_name
|
||||
(
|
||||
columns_definition,
|
||||
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
|
||||
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
|
||||
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment']
|
||||
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment']
|
||||
INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" = "chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your comment']
|
||||
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
|
||||
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode", "support_phrase" = "true|false")] [COMMENT 'your comment']
|
||||
)
|
||||
table_properties;
|
||||
```
|
||||
@ -99,9 +101,9 @@ table_properties;
|
||||
- add an inverted index to existed table
|
||||
```sql
|
||||
-- syntax 1
|
||||
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
|
||||
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'];
|
||||
-- syntax 2
|
||||
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
|
||||
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'];
|
||||
```
|
||||
|
||||
- drop an inverted index
|
||||
|
||||
@ -52,7 +52,7 @@ Doris倒排索引的功能简要介绍如下:
|
||||
- 增加了字符串类型的全文检索
|
||||
- 支持字符串全文检索,包括同时匹配多个关键字MATCH_ALL、匹配任意一个关键字MATCH_ANY、匹配短语词组MATCH_PHRASE
|
||||
- 支持字符串数组类型的全文检索
|
||||
- 支持英文、中文分词
|
||||
- 支持英文、中文以及混合类型分词
|
||||
- 加速普通等值、范围查询,覆盖bitmap索引的功能,未来会代替bitmap索引
|
||||
- 支持字符串、数值、日期时间类型的 =, !=, >, >=, <, <= 快速过滤
|
||||
- 支持字符串、数字、日期时间数组类型的 =, !=, >, >=, <, <=
|
||||
@ -72,11 +72,12 @@ Doris倒排索引的功能简要介绍如下:
|
||||
- parser指定分词器
|
||||
- 默认不指定代表不分词
|
||||
- english是英文分词,适合被索引列是英文的情况,用空格和标点符号分词,性能高
|
||||
- chinese是中文分词,适合被索引列有中文或者中英文混合的情况,采用jieba分词库,性能比english分词低
|
||||
- chinese是中文分词,适合被索引列有中文或者中英文混合的情况,性能比english分词低
|
||||
- unicode是混合类型分词,适用于中英文混合的情况。它能够对邮箱前缀和后缀、IP地址以及字符数字混合进行分词,并且可以对中文字符进行1-gram分词。
|
||||
- parser_mode用于指定中文分词的模式
|
||||
- fine_grained模式,系统将对可以进行分词的部分都进行详尽的分词处理
|
||||
- coarse_grained模式,系统则依据最大化原则,执行精确且全面的分词操作
|
||||
- 默认find_grained模式
|
||||
- 默认coarse_grained模式
|
||||
- support_phrase用于指定索引是否需要支持短语模式
|
||||
- true为需要
|
||||
- false为不需要
|
||||
@ -87,10 +88,10 @@ Doris倒排索引的功能简要介绍如下:
|
||||
CREATE TABLE table_name
|
||||
(
|
||||
columns_definition,
|
||||
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
|
||||
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
|
||||
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment']
|
||||
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment']
|
||||
INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" = "chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your comment']
|
||||
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
|
||||
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
|
||||
)
|
||||
table_properties;
|
||||
```
|
||||
@ -98,9 +99,9 @@ table_properties;
|
||||
- 已有表增加倒排索引
|
||||
```sql
|
||||
-- 语法1
|
||||
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
|
||||
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'];
|
||||
-- 语法2
|
||||
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
|
||||
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'];
|
||||
```
|
||||
|
||||
- 删除倒排索引
|
||||
@ -149,7 +150,7 @@ USE test_inverted_index;
|
||||
|
||||
-- 创建表的同时创建了comment的倒排索引idx_comment
|
||||
-- USING INVERTED 指定索引类型是倒排索引
|
||||
-- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词,如果不指定"parser"参数表示不分词
|
||||
-- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词和"unicode"中英文混合分词,如果不指定"parser"参数表示不分词
|
||||
CREATE TABLE hackernews_1m
|
||||
(
|
||||
`id` BIGINT,
|
||||
|
||||
@ -28,6 +28,7 @@ public class InvertedIndexUtil {
|
||||
public static String INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
|
||||
public static String INVERTED_INDEX_PARSER_NONE = "none";
|
||||
public static String INVERTED_INDEX_PARSER_STANDARD = "standard";
|
||||
public static String INVERTED_INDEX_PARSER_UNICODE = "unicode";
|
||||
public static String INVERTED_INDEX_PARSER_ENGLISH = "english";
|
||||
public static String INVERTED_INDEX_PARSER_CHINESE = "chinese";
|
||||
|
||||
@ -53,8 +54,9 @@ public class InvertedIndexUtil {
|
||||
if (colType.isStringType()) {
|
||||
if (!(parser.equals(INVERTED_INDEX_PARSER_NONE)
|
||||
|| parser.equals(INVERTED_INDEX_PARSER_STANDARD)
|
||||
|| parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
|
||||
|| parser.equals(INVERTED_INDEX_PARSER_CHINESE))) {
|
||||
|| parser.equals(INVERTED_INDEX_PARSER_UNICODE)
|
||||
|| parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
|
||||
|| parser.equals(INVERTED_INDEX_PARSER_CHINESE))) {
|
||||
throw new AnalysisException("INVERTED index parser: " + parser
|
||||
+ " is invalid for column: " + indexColName + " of type " + colType);
|
||||
}
|
||||
|
||||
@ -11,18 +11,12 @@
|
||||
-- !sql --
|
||||
1 我来到北京清华大学
|
||||
|
||||
-- !sql --
|
||||
1 我来到北京清华大学
|
||||
|
||||
-- !sql --
|
||||
3 人民可以得到更多实惠
|
||||
|
||||
-- !sql --
|
||||
2 我爱你中国
|
||||
|
||||
-- !sql --
|
||||
1 我来到北京清华大学
|
||||
|
||||
-- !sql --
|
||||
|
||||
-- !sql --
|
||||
|
||||
@ -48,7 +48,6 @@ suite("test_chinese_analyzer"){
|
||||
|
||||
sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');"
|
||||
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我爱你' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '大学' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华大学' ORDER BY id;"
|
||||
@ -74,7 +73,6 @@ suite("test_chinese_analyzer"){
|
||||
|
||||
sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');"
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
|
||||
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"
|
||||
|
||||
Reference in New Issue
Block a user