[opt](inverted index) the "unicode" tokenizer can be configured to disable stop words. (#34467)

This commit is contained in:
zzzxl
2024-05-07 18:23:43 +08:00
committed by GitHub
parent f48b45700b
commit ac56255f82
13 changed files with 166 additions and 11 deletions

View File

@ -52,6 +52,8 @@ public class InvertedIndexUtil {
public static String INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";
public static String INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords";
public static String getInvertedIndexParser(Map<String, String> properties) {
String parser = properties == null ? null : properties.get(INVERTED_INDEX_PARSER_KEY);
// default is "none" if not set
@ -136,7 +138,8 @@ public class InvertedIndexUtil {
INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN,
INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT,
INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY,
INVERTED_INDEX_PARSER_LOWERCASE_KEY
INVERTED_INDEX_PARSER_LOWERCASE_KEY,
INVERTED_INDEX_PARSER_STOPWORDS_KEY
));
for (String key : properties.keySet()) {
@ -152,6 +155,7 @@ public class InvertedIndexUtil {
String charFilterPattern = properties.get(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE_KEY);
String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
if (parser != null && !parser.matches("none|english|unicode|chinese|standard")) {
throw new AnalysisException("Invalid inverted index 'parser' value: " + parser
@ -194,5 +198,10 @@ public class InvertedIndexUtil {
throw new AnalysisException(
"Invalid inverted index 'lower_case' value: " + lowerCase + ", lower_case must be true or false");
}
if (stopWords != null && !stopWords.matches("none")) {
throw new AnalysisException("Invalid inverted index 'stopWords' value: " + stopWords
+ ", stopWords must be none");
}
}
}