From 0cd51835561dea334d59d823011d9b6ec86906a3 Mon Sep 17 00:00:00 2001 From: airborne12 Date: Wed, 2 Aug 2023 19:12:22 +0800 Subject: [PATCH] [Refactor](inverted index) refact tokenize function for inverted index (#22313) --- be/src/vec/functions/function_tokenize.cpp | 81 ++++++++++++------- be/src/vec/functions/function_tokenize.h | 7 +- .../inverted_index_p0/test_array_index.out | 1 + .../data/inverted_index_p0/test_tokenize.out | 6 ++ .../inverted_index_p0/test_tokenize.groovy | 27 ++++++- 5 files changed, 89 insertions(+), 33 deletions(-) diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index fb311ca627..c7764bcf49 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -25,7 +25,6 @@ #include "CLucene/StdHeader.h" #include "CLucene/config/repl_wchar.h" #include "olap/inverted_index_parser.h" -#include "olap/rowset/segment_v2/inverted_index_reader.h" #include "vec/columns/column.h" #include "vec/common/string_ref.h" #include "vec/core/block.h" @@ -37,11 +36,49 @@ namespace doris::vectorized { -void FunctionTokenize::_execute_constant(const ColumnString& src_column_string, - const StringRef& tokenize_type, - IColumn& dest_nested_column, - ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map) { +Status parse(const std::string& str, std::map<std::string, std::string>& result) { + std::string::size_type start = 0; + + while (start < str.size()) { + std::string::size_type end = str.find(',', start); + std::string pair = + (end == std::string::npos) ? 
str.substr(start) : str.substr(start, end - start); + + std::string::size_type eq_pos = pair.find('='); + if (eq_pos == std::string::npos) { + return Status::InvalidArgument( + fmt::format("invalid params {} for function tokenize", str)); + } + std::string key = pair.substr(0, eq_pos); + key = key.substr(key.find_first_not_of(" '\"" + "\t\n\r"), + key.find_last_not_of(" '\"" + "\t\n\r") - + key.find_first_not_of(" '\"" + "\t\n\r") + + 1); + std::string value = pair.substr(eq_pos + 1); + value = value.substr(value.find_first_not_of(" '\"" + "\t\n\r"), + value.find_last_not_of(" '\"" + "\t\n\r") - + value.find_first_not_of(" '\"" + "\t\n\r") + + 1); + + result[key] = value; + + start = (end == std::string::npos) ? str.size() : end + 1; + } + + return Status::OK(); +} + +void FunctionTokenize::_do_tokenize(const ColumnString& src_column_string, + InvertedIndexCtx& inverted_index_ctx, + IColumn& dest_nested_column, + ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map) { ColumnString& dest_column_string = reinterpret_cast<ColumnString&>(dest_nested_column); ColumnString::Chars& column_string_chars = dest_column_string.get_chars(); ColumnString::Offsets& column_string_offsets = dest_column_string.get_offsets(); @@ -51,25 +88,6 @@ void FunctionTokenize::_execute_constant(const ColumnString& src_column_string, ColumnArray::Offset64 dest_pos = 0; ColumnArray::Offset64 src_offsets_size = src_column_string.get_offsets().size(); - InvertedIndexCtx inverted_index_ctx; - auto parser_type = get_inverted_index_parser_type_from_string(tokenize_type.to_string()); - - switch (parser_type) { - case InvertedIndexParserType::PARSER_CHINESE: { - // we don't support parse_mode params now, so make it default. 
- inverted_index_ctx.parser_mode = INVERTED_INDEX_PARSER_COARSE_GRANULARITY; - inverted_index_ctx.parser_type = parser_type; - break; - } - case InvertedIndexParserType::PARSER_UNICODE: { - inverted_index_ctx.parser_type = parser_type; - break; - } - default: - // default as english - inverted_index_ctx.parser_type = InvertedIndexParserType::PARSER_ENGLISH; - } - for (size_t i = 0; i < src_offsets_size; i++) { const StringRef tokenize_str = src_column_string.get_data_at(i); @@ -129,8 +147,17 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block if (auto col_left = check_and_get_column<ColumnString>(src_column.get())) { if (auto col_right = check_and_get_column<ColumnString>(right_column.get())) { - _execute_constant(*col_left, col_right->get_data_at(0), *dest_nested_column, - dest_offsets, dest_nested_null_map); + InvertedIndexCtx inverted_index_ctx; + std::map<std::string, std::string> properties; + auto st = parse(col_right->get_data_at(0).to_string(), properties); + if (!st.ok()) { + return st; + } + inverted_index_ctx.parser_type = get_inverted_index_parser_type_from_string( + get_parser_string_from_properties(properties)); + inverted_index_ctx.parser_mode = get_parser_mode_string_from_properties(properties); + _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets, + dest_nested_null_map); block.replace_by_position(result, std::move(dest_column_ptr)); return Status::OK(); diff --git a/be/src/vec/functions/function_tokenize.h b/be/src/vec/functions/function_tokenize.h index a3145c58c8..20b4f74284 100644 --- a/be/src/vec/functions/function_tokenize.h +++ b/be/src/vec/functions/function_tokenize.h @@ -24,6 +24,7 @@ #include <memory> #include "common/status.h" +#include "olap/rowset/segment_v2/inverted_index_reader.h" #include "udf/udf.h" #include "vec/columns/column_array.h" #include "vec/core/column_numbers.h" @@ -64,9 +65,9 @@ public: << " and arguments[1] is " << arguments[1]->get_name(); return std::make_shared<DataTypeArray>(make_nullable(arguments[0])); } - void 
_execute_constant(const ColumnString& src_column_string, const StringRef& delimiter_ref, - IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, - NullMapType* dest_nested_null_map); + void _do_tokenize(const ColumnString& src_column_string, InvertedIndexCtx& inverted_index_ctx, + IColumn& dest_nested_column, ColumnArray::Offsets64& dest_offsets, + NullMapType* dest_nested_null_map); Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments, size_t result, size_t /*input_rows_count*/) override; diff --git a/regression-test/data/inverted_index_p0/test_array_index.out b/regression-test/data/inverted_index_p0/test_array_index.out index 0d7529c8b6..c23617fbdb 100644 --- a/regression-test/data/inverted_index_p0/test_array_index.out +++ b/regression-test/data/inverted_index_p0/test_array_index.out @@ -68,3 +68,4 @@ -- !sql -- 2 [20, 30, 40] ["i", "love", "north korea"] + diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out index b99b79435b..731ae4249e 100644 --- a/regression-test/data/inverted_index_p0/test_tokenize.out +++ b/regression-test/data/inverted_index_p0/test_tokenize.out @@ -13,3 +13,9 @@ ["人", "民", "可", "以", "得", "到", "更", "多", "实", "惠"] ["陕", "西", "省", "西", "安", "市", "高", "新", "区", "创", "业", "大", "厦", "座", "我", "的", "手", "机", "号", "码", "是", "12345678901", "邮", "箱", "是", "12345678", "qq.com", "ip", "是", "1.1.1.1", "information", "created", "automatically"] +-- !sql -- +["来到", "北京", "清华", "清华大学", "华大", "大学"] +["我爱你", "中国"] +["人民", "得到", "更多", "实惠"] +["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座", "手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com", "ip", "information", "created", "automatically"] + diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index 572e432f6e..7780329da0 100644 --- 
a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -47,8 +47,8 @@ suite("test_tokenize"){ logger.info("show variales result: " + var_result ) sql "INSERT INTO $indexTblName VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');" - qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName"; - qt_sql "SELECT TOKENIZE(c, 'chinese') FROM $indexTblName WHERE c MATCH '人民'"; + qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese'\") FROM $indexTblName"; + qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese'\") FROM $indexTblName WHERE c MATCH '人民'"; def indexTblName2 = "tokenize_test2" @@ -69,5 +69,26 @@ suite("test_tokenize"){ """ sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');" - qt_sql "SELECT TOKENIZE(c, 'unicode') FROM $indexTblName2"; + qt_sql "SELECT TOKENIZE(c, \"'parser'='unicode'\") FROM $indexTblName2"; + + def indexTblName3 = "tokenize_test3" + + sql "DROP TABLE IF EXISTS ${indexTblName3}" + // create 1 replica table + sql """ + CREATE TABLE IF NOT EXISTS ${indexTblName3}( + `id`int(11)NULL, + `c` text NULL, + INDEX c_idx(`c`) USING INVERTED PROPERTIES("parser"="unicode") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + PROPERTIES( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');" + qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3"; }