From c01230f99ad73d09138bb1c4852fbd4489ddd5d7 Mon Sep 17 00:00:00 2001
From: YueW <45946325+Tanya-W@users.noreply.github.com>
Date: Tue, 25 Jul 2023 14:22:37 +0800
Subject: [PATCH] [fix](match) Optimize the logic for match_phrase function
 filter (#21622)

---
 be/src/vec/functions/match.cpp                | 21 +++++++++++--------
 .../test_chinese_analyzer.out                 | 13 ++++++++++++
 .../test_chinese_analyzer.groovy              |  5 +++++
 3 files changed, 30 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index 63877bfbd8..a142d8588c 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -236,21 +236,24 @@ Status FunctionMatchPhrase::execute_match(const std::string& column_name,
 
         // TODO: more efficient impl
         bool matched = false;
-        auto it = data_tokens.begin();
-        while (it != data_tokens.end()) {
+        auto data_it = data_tokens.begin();
+        while (data_it != data_tokens.end()) {
             // find position of first token
-            it = std::find(it, data_tokens.end(), query_tokens[0]);
-            if (it != data_tokens.end()) {
+            data_it = std::find(data_it, data_tokens.end(), query_tokens[0]);
+            if (data_it != data_tokens.end()) {
                 matched = true;
-                it++;
-                auto it_more = it;
+                auto data_it_next = ++data_it;
+                auto query_it = query_tokens.begin() + 1;
                 // compare query_tokens after the first to data_tokens one by one
-                for (size_t idx = 1; idx < query_tokens.size(); idx++) {
-                    if (it_more == data_tokens.end() || *it_more != query_tokens[idx]) {
+                while (query_it != query_tokens.end()) {
+                    if (data_it_next == data_tokens.end() || *data_it_next != *query_it) {
                         matched = false;
+                        break;
                     }
-                    it_more++;
+                    query_it++;
+                    data_it_next++;
                 }
+
                 if (matched) {
                     break;
                 }
diff --git a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
index 91045120ab..245c5d567d 100644
--- a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
+++ b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out
@@ -17,6 +17,14 @@
 -- !sql --
 2	我爱你中国
 
+-- !sql --
+2	我爱你中国
+
+-- !sql --
+2	我爱你中国
+
+-- !sql --
+
 -- !sql --
 
 -- !sql --
@@ -33,6 +41,11 @@
 -- !sql --
 2	我爱你中国
 
+-- !sql --
+
+-- !sql --
+2	我爱你中国
+
 -- !sql --
 1	我来到北京清华大学
 
diff --git a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
index dd375f3894..8278170cd4 100644
--- a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
+++ b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy
@@ -73,6 +73,9 @@ suite("test_chinese_analyzer"){
 
     sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');"
     qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH_PHRASE '我爱你' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH_PHRASE '我爱你 中国' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH_PHRASE '北京 大学' ORDER BY id;"
     qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;"
     qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;"
     qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;"
@@ -98,6 +101,8 @@ suite("test_chinese_analyzer"){
 
     sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座，我的手机号码是12345678901,邮箱是12345678@qq.com，,ip是1.1.1.1，this information is created automatically.');"
     qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '我爱你' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '我爱你 中国' ORDER BY id;"
+    qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '北京 大学' ORDER BY id;"
     qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL'我爱你' ORDER BY id;"
     qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '清华' ORDER BY id;"
     qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '大学' ORDER BY id;"