From 0cd51835561dea334d59d823011d9b6ec86906a3 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 2 Aug 2023 19:12:22 +0800 Subject: [PATCH] [Refactor](inverted index) refact tokenize function for inverted index (#22313) --- be/src/vec/functions/function_tokenize.cpp | 81 ++++++++++++------- be/src/vec/functions/function_tokenize.h | 7 +- .../inverted_index_p0/test_array_index.out | 1 + .../data/inverted_index_p0/test_tokenize.out | 6 ++ .../inverted_index_p0/test_tokenize.groovy | 27 ++++++- 5 files changed, 89 insertions(+), 33 deletions(-) diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index fb311ca627..c7764bcf49 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -25,7 +25,6 @@ #include "CLucene/StdHeader.h" #include "CLucene/config/repl_wchar.h" #include "olap/inverted_index_parser.h" -#include "olap/rowset/segment_v2/inverted_index_reader.h" #include "vec/columns/column.h" #include "vec/common/string_ref.h" #include "vec/core/block.h" @@ -37,11 +36,49 @@ namespace doris::vectorized { -void FunctionTokenize::_execute_constant(const ColumnString& src_column_string, - const StringRef& tokenize_type, - IColumn& dest_nested_column, - ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map) { +Status parse(const std::string& str, std::map<std::string, std::string>& result) { + std::string::size_type start = 0; + + while (start < str.size()) { + std::string::size_type end = str.find(',', start); + std::string pair = + (end == std::string::npos) ? 
str.substr(start) : str.substr(start, end - start); + + std::string::size_type eq_pos = pair.find('='); + if (eq_pos == std::string::npos) { + return Status::InvalidArgument( + fmt::format("invalid params {} for function tokenize", str)); + } + std::string key = pair.substr(0, eq_pos); + key = key.substr(key.find_first_not_of(" '\"" + "\t\n\r"), + key.find_last_not_of(" '\"" + "\t\n\r") - + key.find_first_not_of(" '\"" + "\t\n\r") + + 1); + std::string value = pair.substr(eq_pos + 1); + value = value.substr(value.find_first_not_of(" '\"" + "\t\n\r"), + value.find_last_not_of(" '\"" + "\t\n\r") - + value.find_first_not_of(" '\"" + "\t\n\r") + + 1); + + result[key] = value; + + start = (end == std::string::npos) ? str.size() : end + 1; + } + + return Status::OK(); +} + +void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, + InvertedIndexCtx& inverted_index_ctx, + IColumn& dest_nested_column, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) { ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column); ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); @@ -51,25 +88,6 @@ void FunctionTokenize::_execute_constant(const ColumnString& src_column_string, ColumnArray::Offset64 dest_pos = 0; ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size(); - InvertedIndexCtx inverted_index_ctx; - auto parser_type = get_inverted_index_parser_type_from_string(tokenize_type.to_string()); - - switch (parser_type) { - case InvertedIndexParserType::PARSER_CHINESE: { - // we don't support parse_mode params now, so make it default. 
- inverted_index_ctx.parser_mode = INVERTED_INDEX_PARSER_COARSE_GRANULARITY; - inverted_index_ctx.parser_type = parser_type; - break; - } - case InvertedIndexParserType::PARSER_UNICODE: { - inverted_index_ctx.parser_type = parser_type; - break; - } - default: - // default as english - inverted_index_ctx.parser_type = InvertedIndexParserType::PARSER_ENGLISH; - } - for (size_t i = 0; i < src_offsets_size; i++) { const StringRef tokenize_str = src_column_string.get_data_at(i); @@ -129,8 +147,17 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block if (auto col_left = check_and_get_column<ColumnString>(src_column.get())) { if (auto col_right = check_and_get_column<ColumnString>(right_column.get())) { - _execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column, - dest_offsets, dest_nested_null_map); + InvertedIndexCtx inverted_index_ctx; + std::map<std::string, std::string> properties; + auto st = parse(col_right->get_data_at(0).to_string(), properties); + if (!st.ok()) { + return st; + } + inverted_index_ctx.parser_type = get_inverted_index_parser_type_from_string( + get_parser_string_from_properties(properties)); + inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties); + _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets, + dest_nested_null_map); block.replace_by_position(result, std::move(dest_column_ptr)); return Status::OK(); diff --git a/be/src/vec/functions/function_tokenize.h b/be/src/vec/functions/function_tokenize.h index a3145c58c8..20b4f74284 100644 --- a/be/src/vec/functions/function_tokenize.h +++ b/be/src/vec/functions/function_tokenize.h @@ -24,6 +24,7 @@ #include <memory> #include "common/status.h" +#include "olap/rowset/segment_v2/inverted_index_reader.h" #include "udf/udf.h" #include "vec/columns/column_array.h" #include "vec/core/column_numbers.h" @@ -64,9 +65,9 @@ public: << " and arguments[1] is " << arguments[1]->get_name(); return std::make_shared<DataTypeArray>(make_nullable(arguments[0])); } - void 
_execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref, - IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map); + void _do_tokenize(const ColumnString& src_column_string, InvertedIndexCtx& inverted_index_ctx, + IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map); Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, size_t result, size_t /*input_rows_count*/) override; diff --git a/regression-test/data/inverted_index_p0/test_array_index.out b/regression-test/data/inverted_index_p0/test_array_index.out index 0d7529c8b6..c23617fbdb 100644 --- a/regression-test/data/inverted_index_p0/test_array_index.out +++ b/regression-test/data/inverted_index_p0/test_array_index.out @@ -68,3 +68,4 @@ -- !sql -- 2 [20, 30, 40] ["i", "love", "north korea"] + diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out index b99b79435b..731ae4249e 100644 --- a/regression-test/data/inverted_index_p0/test_tokenize.out +++ b/regression-test/data/inverted_index_p0/test_tokenize.out @@ -13,3 +13,9 @@ ["人", "民", "可", "以", "得", "到", "更", "多", "实", "惠"] ["陕", "西", "省", "西", "安", "市", "高", "新", "区", "创", "业", "大", "厦", "座", "我", "的", "手", "机", "号", "码", "是", "12345678901", "邮", "箱", "是", "12345678", "qq.com", "ip", "是", "1.1.1.1", "information", "created", "automatically"] +-- !sql -- +["来到", "北京", "清华", "清华大学", "华大", "大学"] +["我爱你", "中国"] +["人民", "得到", "更多", "实惠"] +["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座", "手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com", "ip", "information", "created", "automatically"] + diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index 572e432f6e..7780329da0 100644 --- 
a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -47,8 +47,8 @@ suite("test_tokenize"){ logger.info("show variales result: " + var_result ) sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');" - qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName"; - qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName WHERE c MATCH '人民'"; + qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese'\") FROM $indexTblName"; + qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese'\") FROM $indexTblName WHERE c MATCH '人民'"; def indexTblName2 = "tokenize_test2" @@ -69,5 +69,26 @@ suite("test_tokenize"){ """ sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');" - qt_sql "SELECT TOKENIZE(c, 'unicode') FROM $indexTblName2"; + qt_sql "SELECT TOKENIZE(c, \"'parser'='unicode'\") FROM $indexTblName2"; + + def indexTblName3 = "tokenize_test3" + + sql "DROP TABLE IF EXISTS ${indexTblName3}" + // create 1 replica table + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName3}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');" + qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3"; }