[Feature](inverted index) add unicode parser for inverted index (#21035)

This commit is contained in:
airborne12
2023-06-21 20:14:06 +08:00
committed by GitHub
parent cc53391c9a
commit 6ac0bfeceb
10 changed files with 46 additions and 32 deletions

View File

@ -27,6 +27,8 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_
return INVERTED_INDEX_PARSER_NONE;
case InvertedIndexParserType::PARSER_STANDARD:
return INVERTED_INDEX_PARSER_STANDARD;
case InvertedIndexParserType::PARSER_UNICODE:
return INVERTED_INDEX_PARSER_UNICODE;
case InvertedIndexParserType::PARSER_ENGLISH:
return INVERTED_INDEX_PARSER_ENGLISH;
case InvertedIndexParserType::PARSER_CHINESE:
@ -44,6 +46,8 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st
return InvertedIndexParserType::PARSER_NONE;
} else if (parser_str_lower == INVERTED_INDEX_PARSER_STANDARD) {
return InvertedIndexParserType::PARSER_STANDARD;
} else if (parser_str_lower == INVERTED_INDEX_PARSER_UNICODE) {
return InvertedIndexParserType::PARSER_UNICODE;
} else if (parser_str_lower == INVERTED_INDEX_PARSER_ENGLISH) {
return InvertedIndexParserType::PARSER_ENGLISH;
} else if (parser_str_lower == INVERTED_INDEX_PARSER_CHINESE) {
@ -67,7 +71,7 @@ std::string get_parser_mode_string_from_properties(
if (properties.find(INVERTED_INDEX_PARSER_MODE_KEY) != properties.end()) {
return properties.at(INVERTED_INDEX_PARSER_MODE_KEY);
} else {
return INVERTED_INDEX_PARSER_FINE_GRANULARITY;
return INVERTED_INDEX_PARSER_COARSE_GRANULARITY;
}
}

View File

@ -29,6 +29,7 @@ enum class InvertedIndexParserType {
PARSER_STANDARD = 2,
PARSER_ENGLISH = 3,
PARSER_CHINESE = 4,
PARSER_UNICODE = 5,
};
struct InvertedIndexCtx {
@ -46,6 +47,7 @@ const std::string INVERTED_INDEX_PARSER_KEY = "parser";
const std::string INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
const std::string INVERTED_INDEX_PARSER_NONE = "none";
const std::string INVERTED_INDEX_PARSER_STANDARD = "standard";
const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode";
const std::string INVERTED_INDEX_PARSER_ENGLISH = "english";
const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese";

View File

@ -103,6 +103,11 @@ std::vector<std::wstring> InvertedIndexReader::get_analyse_result(
analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
reader.reset(
(new lucene::util::StringReader(std::wstring(value.begin(), value.end()).c_str())));
} else if (analyser_type == InvertedIndexParserType::PARSER_UNICODE) {
analyzer = std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
reader.reset(new lucene::util::SimpleInputStreamReader(
new lucene::util::AStringReader(value.c_str()),
lucene::util::SimpleInputStreamReader::UTF8));
} else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
auto chinese_analyzer =
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);

View File

@ -154,7 +154,8 @@ public:
_doc = std::make_unique<lucene::document::Document>();
_dir.reset(DorisCompoundDirectory::getDirectory(_fs, index_path.c_str(), true));
if (_parser_type == InvertedIndexParserType::PARSER_STANDARD) {
if (_parser_type == InvertedIndexParserType::PARSER_STANDARD ||
_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
_analyzer = std::make_unique<lucene::analysis::standard::StandardAnalyzer>();
} else if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH) {
_analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
@ -163,10 +164,10 @@ public:
chinese_analyzer->setLanguage(L"chinese");
chinese_analyzer->initDict(config::inverted_index_dict_path);
auto mode = get_parser_mode_string_from_properties(_index_meta->properties());
if (mode == INVERTED_INDEX_PARSER_COARSE_GRANULARITY) {
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
} else {
if (mode == INVERTED_INDEX_PARSER_FINE_GRANULARITY) {
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::All);
} else {
chinese_analyzer->setMode(lucene::analysis::AnalyzerMode::Default);
}
_analyzer.reset(chinese_analyzer);
} else {
@ -222,6 +223,11 @@ public:
if (_parser_type == InvertedIndexParserType::PARSER_ENGLISH ||
_parser_type == InvertedIndexParserType::PARSER_CHINESE) {
new_char_token_stream(field_value_data, field_value_size, _field);
} else if (_parser_type == InvertedIndexParserType::PARSER_UNICODE) {
auto stringReader = _CLNEW lucene::util::SimpleInputStreamReader(
new lucene::util::AStringReader(field_value_data, field_value_size),
lucene::util::SimpleInputStreamReader::UTF8);
_field->setValue(stringReader);
} else {
new_field_value(field_value_data, field_value_size, _field);
}

View File

@ -52,7 +52,7 @@ The features for inverted index is as follows:
- add fulltext search on text(string, varchar, char) field
- MATCH_ALL matches all keywords, MATCH_ANY matches any keywords
- support fulltext on array of text field
- support english and chinese word parser
- support English, Chinese, and mixed Unicode word parsers
- accelerate normal equality and range queries, replacing the bitmap index in the future
- support =, !=, >, >=, <, <= on text, numeric, datetime types
- support =, !=, >, >=, <, <= on array of text, numeric, datetime types
@ -74,10 +74,12 @@ The features for inverted index is as follows:
- missing stands for no parser, the whole field is considered to be a term
- "english" stands for english parser
- "chinese" stands for chinese parser
- "unicode" stands for the mixed-type parser, suitable for text that mixes Chinese and English. It can segment email prefixes and suffixes, IP addresses, and mixed alphanumeric strings, and it segments Chinese characters into 1-grams.
- "parser_mode" is utilized to set the tokenizer/parser type for Chinese word segmentation.
- in "fine_grained" mode, the system will meticulously tokenize each possible segment.
- in "coarse_grained" mode, the system follows the maximization principle, performing accurate and comprehensive tokenization.
- default mode is "fine_grained".
- default mode is "coarse_grained".
- "support_phrase" is utilized to specify if the index requires support for phrase mode.
- "true" indicates that support is needed.
- "false" indicates that support is not needed.
@ -88,10 +90,10 @@ The features for inverted index is as follows:
CREATE TABLE table_name
(
columns_definition,
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment']
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment']
INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" = "chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your comment']
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode", "support_phrase" = "true|false")] [COMMENT 'your comment']
)
table_properties;
```
@ -99,9 +101,9 @@ table_properties;
- add an inverted index to an existing table
```sql
-- syntax 1
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'];
-- syntax 2
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese|unicode")] [COMMENT 'your comment'];
```
- drop an inverted index

View File

@ -52,7 +52,7 @@ Doris倒排索引的功能简要介绍如下:
- 增加了字符串类型的全文检索
- 支持字符串全文检索,包括同时匹配多个关键字MATCH_ALL、匹配任意一个关键字MATCH_ANY、匹配短语词组MATCH_PHRASE
- 支持字符串数组类型的全文检索
- 支持英文、中文分词
- 支持英文、中文以及混合类型分词
- 加速普通等值、范围查询,覆盖bitmap索引的功能,未来会代替bitmap索引
- 支持字符串、数值、日期时间类型的 =, !=, >, >=, <, <= 快速过滤
- 支持字符串、数字、日期时间数组类型的 =, !=, >, >=, <, <=
@ -72,11 +72,12 @@ Doris倒排索引的功能简要介绍如下:
- parser指定分词器
- 默认不指定代表不分词
- english是英文分词,适合被索引列是英文的情况,用空格和标点符号分词,性能高
- chinese是中文分词,适合被索引列有中文或者中英文混合的情况,采用jieba分词库,性能比english分词低
- chinese是中文分词,适合被索引列有中文或者中英文混合的情况,性能比english分词低
- unicode是混合类型分词,适用于中英文混合的情况。它能够对邮箱前缀和后缀、IP地址以及字符数字混合进行分词,并且可以对中文字符进行1-gram分词。
- parser_mode用于指定中文分词的模式
- fine_grained模式,系统将对可以进行分词的部分都进行详尽的分词处理
- coarse_grained模式,系统则依据最大化原则,执行精确且全面的分词操作
- 默认fine_grained模式
- 默认coarse_grained模式
- support_phrase用于指定索引是否需要支持短语模式
- true为需要
- false为不需要
@ -87,10 +88,10 @@ Doris倒排索引的功能简要介绍如下:
CREATE TABLE table_name
(
columns_definition,
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment']
INDEX idx_name1(column_name1) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment']
INDEX idx_name2(column_name2) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment']
INDEX idx_name3(column_name3) USING INVERTED [PROPERTIES("parser" = "chinese", "parser_mode" = "fine_grained|coarse_grained")] [COMMENT 'your comment']
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
INDEX idx_name4(column_name4) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese", "support_phrase" = "true|false")] [COMMENT 'your comment']
)
table_properties;
```
@ -98,9 +99,9 @@ table_properties;
- 已有表增加倒排索引
```sql
-- 语法1
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
CREATE INDEX idx_name ON table_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'];
-- 语法2
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|chinese")] [COMMENT 'your comment'];
ALTER TABLE table_name ADD INDEX idx_name(column_name) USING INVERTED [PROPERTIES("parser" = "english|unicode|chinese")] [COMMENT 'your comment'];
```
- 删除倒排索引
@ -149,7 +150,7 @@ USE test_inverted_index;
-- 创建表的同时创建了comment的倒排索引idx_comment
-- USING INVERTED 指定索引类型是倒排索引
-- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词,如果不指定"parser"参数表示不分词
-- PROPERTIES("parser" = "english") 指定采用english分词,还支持"chinese"中文分词和"unicode"中英文混合分词,如果不指定"parser"参数表示不分词
CREATE TABLE hackernews_1m
(
`id` BIGINT,

View File

@ -28,6 +28,7 @@ public class InvertedIndexUtil {
public static String INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
public static String INVERTED_INDEX_PARSER_NONE = "none";
public static String INVERTED_INDEX_PARSER_STANDARD = "standard";
public static String INVERTED_INDEX_PARSER_UNICODE = "unicode";
public static String INVERTED_INDEX_PARSER_ENGLISH = "english";
public static String INVERTED_INDEX_PARSER_CHINESE = "chinese";
@ -53,8 +54,9 @@ public class InvertedIndexUtil {
if (colType.isStringType()) {
if (!(parser.equals(INVERTED_INDEX_PARSER_NONE)
|| parser.equals(INVERTED_INDEX_PARSER_STANDARD)
|| parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
|| parser.equals(INVERTED_INDEX_PARSER_CHINESE))) {
|| parser.equals(INVERTED_INDEX_PARSER_UNICODE)
|| parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
|| parser.equals(INVERTED_INDEX_PARSER_CHINESE))) {
throw new AnalysisException("INVERTED index parser: " + parser
+ " is invalid for column: " + indexColName + " of type " + colType);
}

View File

@ -11,18 +11,12 @@
-- !sql --
1 我来到北京清华大学
-- !sql --
1 我来到北京清华大学
-- !sql --
3 人民可以得到更多实惠
-- !sql --
2 我爱你中国
-- !sql --
1 我来到北京清华大学
-- !sql --
-- !sql --

View File

@ -48,7 +48,6 @@ suite("test_chinese_analyzer"){
sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我爱你' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '我' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName WHERE c MATCH '清华大学' ORDER BY id;"
@ -74,7 +73,6 @@ suite("test_chinese_analyzer"){
sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"