[opt](inverted index) the "unicode" tokenizer can be configured to disable stop words. (#34467)

2024-05-07 18:23:43 +08:00
parent f48b45700b
commit ac56255f82
13 changed files with 166 additions and 11 deletions
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@ -52,6 +52,8 @@ public class InvertedIndexUtil {

    public static String INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

+    public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
+
    public static String getInvertedIndexParser(Map<String, String> properties) {
        String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
        // default is "none" if not set
@ -136,7 +138,8 @@ public class InvertedIndexUtil {
                INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN,
                INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
                INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
-                INVERTED_INDEX_PARSER_LOWERCASE_KEY
+                INVERTED_INDEX_PARSER_LOWERCASE_KEY,
+                INVERTED_INDEX_PARSER_STOPWORDS_KEY
        ));

        for (String key : properties.keySet()) {
@ -152,6 +155,7 @@ public class InvertedIndexUtil {
        String charFilterPattern = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
        String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
        String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
+        String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);

        if (parser != null && !parser.matches("none|english|unicode|chinese|standard")) {
            throw new AnalysisException("Invalid inverted index 'parser' value: " + parser
@ -194,5 +198,10 @@ public class InvertedIndexUtil {
            throw new AnalysisException(
                    "Invalid inverted index 'lower_case' value: " + lowerCase + ", lower_case must be true or false");
        }
+
+        if (stopWords != null && !stopWords.matches("none")) {
+            throw new AnalysisException("Invalid inverted index 'stopwords' value: " + stopWords
+                    + ", stopwords must be none");
+        }
    }
 }