From ac56255f8270848ca46a4bbcb8d74f9c9f9abc52 Mon Sep 17 00:00:00 2001 From: zzzxl <33418555+zzzxl1993@users.noreply.github.com> Date: Tue, 7 May 2024 18:23:43 +0800 Subject: [PATCH] [opt](inverted index) the "unicode" tokenizer can be configured to disable stop words. (#34467) --- be/src/clucene | 2 +- be/src/olap/inverted_index_parser.cpp | 9 +++ be/src/olap/inverted_index_parser.h | 5 ++ .../char_filter/char_replace_char_filter.h | 4 +- .../segment_v2/inverted_index_reader.cpp | 30 ++++++++-- .../rowset/segment_v2/inverted_index_reader.h | 5 ++ .../segment_v2/inverted_index_writer.cpp | 14 ++++- be/src/vec/functions/function_tokenize.cpp | 4 ++ .../doris/analysis/InvertedIndexUtil.java | 11 +++- .../data/inverted_index_p0/test_stopwords.out | 23 +++++++ .../data/inverted_index_p0/test_tokenize.out | 6 ++ .../inverted_index_p0/test_stopwords.groovy | 60 +++++++++++++++++++ .../inverted_index_p0/test_tokenize.groovy | 4 ++ 13 files changed, 166 insertions(+), 11 deletions(-) create mode 100644 regression-test/data/inverted_index_p0/test_stopwords.out create mode 100644 regression-test/suites/inverted_index_p0/test_stopwords.groovy diff --git a/be/src/clucene b/be/src/clucene index 9f849a47f7..d3de160871 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 9f849a47f70625a57fedbaa1f5a6f89bc8f32967 +Subproject commit d3de160871dc1e2e293e5702e5b870e220ed42e4 diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 07a587dd2d..a9ed7ec062 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -126,4 +126,13 @@ std::string get_parser_ignore_above_value_from_properties( } } +std::string get_parser_stopwords_from_properties( + const std::map& properties) { + if (properties.find(INVERTED_INDEX_PARSER_STOPWORDS_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_STOPWORDS_KEY); + } else { + return ""; + } +} + } // namespace doris diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index 9df825bf69..5bdfba6a7e 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -79,6 +79,8 @@ const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256"; const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case"; +const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords"; + std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str); @@ -113,4 +115,7 @@ std::string get_parser_lowercase_from_properties( } } +std::string get_parser_stopwords_from_properties( + const std::map& properties); + } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h index 2867890b3e..d9e5080d2d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h @@ -28,12 +28,14 @@ class CharReplaceCharFilter : public lucene::analysis::CharFilter { public: CharReplaceCharFilter(lucene::util::Reader* in, const std::string& pattern, const std::string& replacement); - virtual ~CharReplaceCharFilter() = default; + ~CharReplaceCharFilter() override = default; void init(const void* _value, int32_t _length, bool copyData) override; int32_t read(const void** start, int32_t min, int32_t max) override; int32_t readCopy(void* start, int32_t off, int32_t len) override; + size_t size() override { return _buf.size(); } + private: void fill(); void process_pattern(std::string& buf); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index f08dac8fb9..678c63e078 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -288,12 +288,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run get_parser_mode_string_from_properties(_index_meta.properties()), get_parser_char_filter_map_from_properties(_index_meta.properties())); auto analyzer = create_analyzer(inverted_index_ctx.get()); - auto lowercase = get_parser_lowercase_from_properties(_index_meta.properties()); - if (lowercase == "true") { - analyzer->set_lowercase(true); - } else if (lowercase == "false") { - analyzer->set_lowercase(false); - } + setup_analyzer_lowercase(analyzer, _index_meta.properties()); + setup_analyzer_use_stopwords(analyzer, _index_meta.properties()); inverted_index_ctx->analyzer = analyzer.get(); auto reader = create_reader(inverted_index_ctx.get(), search_str); get_analyse_result(query_info.terms, reader.get(), analyzer.get(), column_name, @@ -382,6 +378,28 @@ InvertedIndexReaderType FullTextIndexReader::type() { return InvertedIndexReaderType::FULLTEXT; } +void FullTextIndexReader::setup_analyzer_lowercase( + std::unique_ptr& analyzer, + const std::map& properties) { + auto lowercase = get_parser_lowercase_from_properties(properties); + if (lowercase == INVERTED_INDEX_PARSER_TRUE) { + analyzer->set_lowercase(true); + } else if (lowercase == INVERTED_INDEX_PARSER_FALSE) { + analyzer->set_lowercase(false); + } +} + +void FullTextIndexReader::setup_analyzer_use_stopwords( + std::unique_ptr& analyzer, + const std::map& properties) { + auto stop_words = get_parser_stopwords_from_properties(properties); + if (stop_words == "none") { + analyzer->set_stopwords(nullptr); + } else { + analyzer->set_stopwords(&lucene::analysis::standard95::stop_words); + } +} + Status StringTypeInvertedIndexReader::new_iterator( OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr* iterator) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 64f78bd52d..bf56d31a2d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -174,6 +174,11 @@ public: InvertedIndexReaderType type() override; + static void setup_analyzer_lowercase(std::unique_ptr& analyzer, + const std::map& properties); + static void setup_analyzer_use_stopwords(std::unique_ptr& analyzer, + const std::map& properties); + private: Status match_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 1fb5e8cc1f..189cf71bef 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -222,6 +222,7 @@ public: break; } setup_analyzer_lowercase(analyzer); + setup_analyzer_use_stopwords(analyzer); return Status::OK(); } catch (CLuceneError& e) { return Status::Error( @@ -231,13 +232,22 @@ public: void setup_analyzer_lowercase(std::unique_ptr& analyzer) { auto lowercase = get_parser_lowercase_from_properties(_index_meta->properties()); - if (lowercase == "true") { + if (lowercase == INVERTED_INDEX_PARSER_TRUE) { analyzer->set_lowercase(true); - } else if (lowercase == "false") { + } else if (lowercase == INVERTED_INDEX_PARSER_FALSE) { analyzer->set_lowercase(false); } } + void setup_analyzer_use_stopwords(std::unique_ptr& analyzer) { + auto stop_words = get_parser_stopwords_from_properties(_index_meta->properties()); + if (stop_words == "none") { + analyzer->set_stopwords(nullptr); + } else { + analyzer->set_stopwords(&lucene::analysis::standard95::stop_words); + } + } + Status init_fulltext_index() { RETURN_IF_ERROR(open_index_directory()); RETURN_IF_ERROR(create_char_string_reader(_char_string_reader)); diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp index 1d9edbd7db..a1ea2e0be9 100644 --- a/be/src/vec/functions/function_tokenize.cpp +++ b/be/src/vec/functions/function_tokenize.cpp @@ -26,6 +26,7 @@ #include "CLucene/StdHeader.h" #include "CLucene/config/repl_wchar.h" #include "olap/inverted_index_parser.h" +#include "olap/rowset/segment_v2/inverted_index_reader.h" #include "vec/columns/column.h" #include "vec/common/string_ref.h" #include "vec/core/block.h" @@ -151,6 +152,9 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block return Status::Error( "inverted index create analyzer failed: {}", e.what()); } + doris::segment_v2::FullTextIndexReader::setup_analyzer_lowercase(analyzer, properties); + doris::segment_v2::FullTextIndexReader::setup_analyzer_use_stopwords(analyzer, + properties); inverted_index_ctx.analyzer = analyzer.get(); _do_tokenize(*col_left, inverted_index_ctx, *dest_nested_column, dest_offsets, diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index b57eae7746..a2b0aa623c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -52,6 +52,8 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case"; + public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords"; + public static String getInvertedIndexParser(Map properties) { String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY); // default is "none" if not set @@ -136,7 +138,8 @@ public class InvertedIndexUtil { INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN, INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT, INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY, - INVERTED_INDEX_PARSER_LOWERCASE_KEY + INVERTED_INDEX_PARSER_LOWERCASE_KEY, + INVERTED_INDEX_PARSER_STOPWORDS_KEY )); for (String key : properties.keySet()) { @@ -152,6 +155,7 @@ public class InvertedIndexUtil { String charFilterPattern = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN); String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY); String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY); + String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY); if (parser != null && !parser.matches("none|english|unicode|chinese|standard")) { throw new AnalysisException("Invalid inverted index 'parser' value: " + parser @@ -194,5 +198,10 @@ public class InvertedIndexUtil { throw new AnalysisException( "Invalid inverted index 'lower_case' value: " + lowerCase + ", lower_case must be true or false"); } + + if (stopWords != null && !stopWords.matches("none")) { + throw new AnalysisException("Invalid inverted index 'stopWords' value: " + stopWords + + ", stopWords must be none"); + } } } diff --git a/regression-test/data/inverted_index_p0/test_stopwords.out b/regression-test/data/inverted_index_p0/test_stopwords.out new file mode 100644 index 0000000000..ba4940bcc5 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_stopwords.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- + +-- !sql -- + +-- !sql -- + +-- !sql -- +1 华夏智胜新税股票A 华夏智胜新税股票A +2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get. + +-- !sql -- +2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get. + +-- !sql -- +2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get. + +-- !sql -- +2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get. + +-- !sql -- +2 Life is like a box of chocolates, you never know what you are going to get. Life is like a box of chocolates, you never know what you are going to get. + diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out index 350e218f57..a3984ca910 100644 --- a/regression-test/data/inverted_index_p0/test_tokenize.out +++ b/regression-test/data/inverted_index_p0/test_tokenize.out @@ -22,3 +22,9 @@ -- !tokenize_sql -- ["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"] +-- !tokenize_sql -- +["华", "夏", "智", "胜", "新", "税", "股", "票"] + +-- !tokenize_sql -- +["华", "夏", "智", "胜", "新", "税", "股", "票", "a"] + diff --git a/regression-test/suites/inverted_index_p0/test_stopwords.groovy b/regression-test/suites/inverted_index_p0/test_stopwords.groovy new file mode 100644 index 0000000000..4f7c577dc5 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_stopwords.groovy @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_stopwords", "p0"){ + def indexTbName = "test_stopwords" + + sql "DROP TABLE IF EXISTS ${indexTbName}" + + sql """ + CREATE TABLE ${indexTbName} ( + `a` int(11) NULL COMMENT "", + `b` text NULL COMMENT "", + `c` text NULL COMMENT "", + INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "unicode") COMMENT '', + INDEX c_idx (`c`) USING INVERTED PROPERTIES("parser" = "unicode", "stopwords" = "none") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`a`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ INSERT INTO ${indexTbName} VALUES (1, "华夏智胜新税股票A", "华夏智胜新税股票A"); """ + sql """ INSERT INTO ${indexTbName} VALUES (2, "Life is like a box of chocolates, you never know what you are going to get. ", "Life is like a box of chocolates, you never know what you are going to get. "); """ + + try { + sql "sync" + + qt_sql """ select * from ${indexTbName} where b match 'a'; """ + qt_sql """ select * from ${indexTbName} where b match 'are'; """ + qt_sql """ select * from ${indexTbName} where b match 'to'; """ + + qt_sql """ select * from ${indexTbName} where c match 'a'; """ + qt_sql """ select * from ${indexTbName} where c match 'are'; """ + qt_sql """ select * from ${indexTbName} where c match 'to'; """ + + qt_sql """ select * from ${indexTbName} where b match_phrase 'like a box'; """ + qt_sql """ select * from ${indexTbName} where c match_phrase 'like a box'; """ + + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy index 5b5c4f02a4..a03b2c85a5 100644 --- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy +++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy @@ -93,4 +93,8 @@ suite("test_tokenize"){ qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3"; qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0 test:abc=bcd','"parser"="unicode","char_filter_type" = "char_replace","char_filter_pattern" = "._=:,","char_filter_replacement" = " "');""" + qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0 test:abc=bcd', '"parser"="unicode","char_filter_type" = "char_replace", "char_filter_pattern" = "._=:,", "char_filter_replacement" = " "');""" + + qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode"');""" + qt_tokenize_sql """SELECT TOKENIZE('华夏智胜新税股票A', '"parser"="unicode","stopwords" = "none"');""" }