From c01230f99ad73d09138bb1c4852fbd4489ddd5d7 Mon Sep 17 00:00:00 2001 From: YueW <45946325+Tanya-W@users.noreply.github.com> Date: Tue, 25 Jul 2023 14:22:37 +0800 Subject: [PATCH] [fix](match) Optimize the logic for match_phrase function filter (#21622) --- be/src/vec/functions/match.cpp | 21 +++++++++++-------- .../test_chinese_analyzer.out | 13 ++++++++++++ .../test_chinese_analyzer.groovy | 5 +++++ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp index 63877bfbd8..a142d8588c 100644 --- a/be/src/vec/functions/match.cpp +++ b/be/src/vec/functions/match.cpp @@ -236,21 +236,24 @@ Status FunctionMatchPhrase::execute_match(const std::string& column_name, // TODO: more efficient impl bool matched = false; - auto it = data_tokens.begin(); - while (it != data_tokens.end()) { + auto data_it = data_tokens.begin(); + while (data_it != data_tokens.end()) { // find position of first token - it = std::find(it, data_tokens.end(), query_tokens[0]); - if (it != data_tokens.end()) { + data_it = std::find(data_it, data_tokens.end(), query_tokens[0]); + if (data_it != data_tokens.end()) { matched = true; - it++; - auto it_more = it; + auto data_it_next = ++data_it; + auto query_it = query_tokens.begin() + 1; // compare query_tokens after the first to data_tokens one by one - for (size_t idx = 1; idx < query_tokens.size(); idx++) { - if (it_more == data_tokens.end() || *it_more != query_tokens[idx]) { + while (query_it != query_tokens.end()) { + if (data_it_next == data_tokens.end() || *data_it_next != *query_it) { matched = false; + break; } - it_more++; + query_it++; + data_it_next++; } + if (matched) { break; } diff --git a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out index 91045120ab..245c5d567d 100644 --- a/regression-test/data/inverted_index_p0/test_chinese_analyzer.out +++ b/regression-test/data/inverted_index_p0/test_chinese_analyzer.out @@ -17,6 +17,14 @@ -- !sql -- 2 我爱你中国 +-- !sql -- +2 我爱你中国 + +-- !sql -- +2 我爱你中国 + +-- !sql -- + -- !sql -- -- !sql -- @@ -33,6 +41,11 @@ -- !sql -- 2 我爱你中国 +-- !sql -- + +-- !sql -- +2 我爱你中国 + -- !sql -- 1 我来到北京清华大学 diff --git a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy index dd375f3894..8278170cd4 100644 --- a/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy +++ b/regression-test/suites/inverted_index_p0/test_chinese_analyzer.groovy @@ -73,6 +73,9 @@ suite("test_chinese_analyzer"){ sql "INSERT INTO $indexTblName2 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠');" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '我爱你' ORDER BY id;" + qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH_PHRASE '我爱你' ORDER BY id;" + qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH_PHRASE '我爱你 中国' ORDER BY id;" + qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH_PHRASE '北京 大学' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '大学' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName2 WHERE c MATCH '清华大学' ORDER BY id;" @@ -98,6 +101,8 @@ suite("test_chinese_analyzer"){ sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是12345678@qq.com,,ip是1.1.1.1,this information is created automatically.');" qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '我爱你' ORDER BY id;" + qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '我爱你 中国' ORDER BY id;" + qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_PHRASE '北京 大学' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL'我爱你' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '清华' ORDER BY id;" qt_sql "SELECT * FROM $indexTblName3 WHERE c MATCH_ALL '大学' ORDER BY id;"