[feature-wip](inverted index) API for inverted index reader and syntax for fulltext match (#14211)

* [feature-wip](inverted index)inverted index api: reader

* [feature-wip](inverted index) Fulltext query syntax with MATCH/MATCH_ANY/MATCH_ALL

* [feature-wip](inverted index) Adapt to index meta

* [enhance] add more metrics

* [enhance] add fulltext match query check for column type and index parser

* [feature-wip](inverted index) Support applying the inverted index in compound predicates, except for leaf nodes of AND nodes
This commit is contained in:
YueW
2022-12-30 21:48:14 +08:00
committed by GitHub
parent b23d068281
commit edecc2e706
45 changed files with 1450 additions and 31 deletions

View File

@ -446,6 +446,15 @@ terminal String
KW_MINUTE,
KW_MODIFY,
KW_MONTH,
KW_MATCH,
KW_MATCH_ANY,
KW_MATCH_ALL,
KW_MATCH_PHRASE,
KW_MATCH_ELEMENT_EQ,
KW_MATCH_ELEMENT_LT,
KW_MATCH_ELEMENT_GT,
KW_MATCH_ELEMENT_LE,
KW_MATCH_ELEMENT_GE,
KW_NAME,
KW_NAMES,
KW_NATURAL,
@ -698,7 +707,7 @@ nonterminal Expr where_clause;
nonterminal Expr delete_on_clause;
nonterminal String sequence_col_clause;
nonterminal Predicate predicate, between_predicate, comparison_predicate,
compound_predicate, in_predicate, like_predicate, exists_predicate;
compound_predicate, in_predicate, like_predicate, exists_predicate, match_predicate;
nonterminal ArrayList<Expr> opt_partition_by_clause;
nonterminal Expr having_clause;
nonterminal ArrayList<OrderByElement> order_by_elements, order_by_clause;
@ -899,6 +908,7 @@ precedence left KW_AND;
precedence left KW_NOT, NOT;
precedence left KW_BETWEEN, KW_IN, KW_IS, KW_EXISTS;
precedence left KW_LIKE, KW_REGEXP;
precedence left KW_MATCH_ANY, KW_MATCH_ALL, KW_MATCH_PHRASE, KW_MATCH, KW_MATCH_ELEMENT_EQ, KW_MATCH_ELEMENT_LT, KW_MATCH_ELEMENT_GT, KW_MATCH_ELEMENT_LE, KW_MATCH_ELEMENT_GE;
precedence left EQUAL, LESSTHAN, GREATERTHAN;
precedence left ADD, SUBTRACT;
precedence left AT, STAR, DIVIDE, MOD, KW_DIV;
@ -6150,6 +6160,8 @@ predicate ::=
{: RESULT = p; :}
| like_predicate:p
{: RESULT = p; :}
| match_predicate:p
{: RESULT = p; :}
| LPAREN predicate:p RPAREN
{:
p.setPrintSqlInParens(true);
@ -6189,6 +6201,27 @@ like_predicate ::=
new LikePredicate(LikePredicate.Operator.REGEXP, e1, e2), null); :}
;
// Match predicates: <expr> <match keyword> <expr>.
// Each alternative builds a MatchPredicate whose operator corresponds to the
// keyword, with the left expression as first operand and the right as second.
match_predicate ::=
expr:e1 KW_MATCH_ANY expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ANY, e1, e2); :}
// KW_MATCH is accepted as a synonym of KW_MATCH_ANY: both produce MATCH_ANY.
| expr:e1 KW_MATCH expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ANY, e1, e2); :}
| expr:e1 KW_MATCH_ALL expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ALL, e1, e2); :}
| expr:e1 KW_MATCH_PHRASE expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_PHRASE, e1, e2); :}
// Element-wise comparison operators (EQ / LT / GT / LE / GE).
| expr:e1 KW_MATCH_ELEMENT_EQ expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_EQ, e1, e2); :}
| expr:e1 KW_MATCH_ELEMENT_LT expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_LT, e1, e2); :}
| expr:e1 KW_MATCH_ELEMENT_GT expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_GT, e1, e2); :}
| expr:e1 KW_MATCH_ELEMENT_LE expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_LE, e1, e2); :}
| expr:e1 KW_MATCH_ELEMENT_GE expr:e2
{: RESULT = new MatchPredicate(MatchPredicate.Operator.MATCH_ELEMENT_GE, e1, e2); :}
;
// Avoid a reduce/reduce conflict with compound_predicate by explicitly
// using non_pred_expr and predicate separately instead of expr.
between_predicate ::=

View File

@ -52,6 +52,7 @@ import org.apache.doris.rewrite.ExprRewriter;
import org.apache.doris.rewrite.ExtractCommonFactorsRule;
import org.apache.doris.rewrite.FoldConstantsRule;
import org.apache.doris.rewrite.InferFiltersRule;
import org.apache.doris.rewrite.MatchPredicateRule;
import org.apache.doris.rewrite.NormalizeBinaryPredicatesRule;
import org.apache.doris.rewrite.RewriteAliasFunctionRule;
import org.apache.doris.rewrite.RewriteBinaryPredicatesRule;
@ -414,6 +415,7 @@ public class Analyzer {
rules.add(RewriteEncryptKeyRule.INSTANCE);
rules.add(RewriteInPredicateRule.INSTANCE);
rules.add(RewriteAliasFunctionRule.INSTANCE);
rules.add(MatchPredicateRule.INSTANCE);
List<ExprRewriteRule> onceRules = Lists.newArrayList();
onceRules.add(ExtractCommonFactorsRule.INSTANCE);
onceRules.add(InferFiltersRule.INSTANCE);

View File

@ -0,0 +1,243 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.analysis;
import org.apache.doris.catalog.ArrayType;
import org.apache.doris.catalog.Function;
import org.apache.doris.catalog.FunctionSet;
import org.apache.doris.catalog.ScalarFunction;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.thrift.TExprNode;
import org.apache.doris.thrift.TExprNodeType;
import org.apache.doris.thrift.TExprOpcode;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.Objects;
/**
 * Predicate of the form {@code field <MATCH operator> query}.
 *
 * <p>Child 0 is the left operand (the field being matched) and child 1 is the
 * right operand (the query value). The operator is one of {@link Operator}.
 */
public class MatchPredicate extends Predicate {
    private static final Logger LOG = LogManager.getLogger(MatchPredicate.class);

    /**
     * Match operators. Each operator carries a display description (used by
     * {@link #toString()} and therefore in SQL text and error messages), the
     * builtin function name it is registered under, and its thrift opcode.
     */
    public enum Operator {
        MATCH_ANY("MATCH_ANY", "match_any", TExprOpcode.MATCH_ANY),
        MATCH_ALL("MATCH_ALL", "match_all", TExprOpcode.MATCH_ALL),
        MATCH_PHRASE("MATCH_PHRASE", "match_phrase", TExprOpcode.MATCH_PHRASE),
        MATCH_ELEMENT_EQ("MATCH_ELEMENT_EQ", "match_element_eq", TExprOpcode.MATCH_ELEMENT_EQ),
        MATCH_ELEMENT_LT("MATCH_ELEMENT_LT", "match_element_lt", TExprOpcode.MATCH_ELEMENT_LT),
        MATCH_ELEMENT_GT("MATCH_ELEMENT_GT", "match_element_gt", TExprOpcode.MATCH_ELEMENT_GT),
        MATCH_ELEMENT_LE("MATCH_ELEMENT_LE", "match_element_le", TExprOpcode.MATCH_ELEMENT_LE),
        MATCH_ELEMENT_GE("MATCH_ELEMENT_GE", "match_element_ge", TExprOpcode.MATCH_ELEMENT_GE);

        private final String description;
        private final String name;
        private final TExprOpcode opcode;

        Operator(String description,
                String name,
                TExprOpcode opcode) {
            this.description = description;
            this.name = name;
            this.opcode = opcode;
        }

        /** Returns the display description of this operator. */
        @Override
        public String toString() {
            return description;
        }

        /** Returns the builtin function name this operator is registered under. */
        public String getName() {
            return name;
        }

        /** Returns the thrift opcode sent for this operator. */
        public TExprOpcode getOpcode() {
            return opcode;
        }
    }

    // Placeholder symbol passed when registering the builtin operators.
    private static final String SYMBOL_NOT_USED = "symbol_not_used";

    // Element-wise operators, in the order they are registered for each item type.
    // Keep in sync with isMatchElement().
    private static final Operator[] ELEMENT_OPERATORS = {
        Operator.MATCH_ELEMENT_EQ,
        Operator.MATCH_ELEMENT_LT,
        Operator.MATCH_ELEMENT_GT,
        Operator.MATCH_ELEMENT_LE,
        Operator.MATCH_ELEMENT_GE,
    };

    // Text operators, in the order they are registered.
    private static final Operator[] TEXT_OPERATORS = {
        Operator.MATCH_ANY,
        Operator.MATCH_ALL,
        Operator.MATCH_PHRASE,
    };

    /**
     * Registers the builtin functions backing every match operator.
     *
     * <p>For each type returned by {@code Type.getNumericDateTimeTypes()} the
     * element operators are registered with signature {@code (ARRAY<t>, t)}.
     * Each text operator is registered twice: {@code (VARCHAR, VARCHAR)} and
     * {@code (ARRAY<VARCHAR>, VARCHAR)}. All functions return BOOLEAN.
     *
     * @param functionSet the function set to register into
     */
    public static void initBuiltins(FunctionSet functionSet) {
        for (Type t : Type.getNumericDateTimeTypes()) {
            for (Operator elementOp : ELEMENT_OPERATORS) {
                addBuiltin(functionSet, elementOp, new ArrayType(t), t);
            }
        }
        for (Operator textOp : TEXT_OPERATORS) {
            addBuiltin(functionSet, textOp, Type.VARCHAR, Type.VARCHAR);
            addBuiltin(functionSet, textOp, new ArrayType(Type.VARCHAR), Type.VARCHAR);
        }
    }

    /** Registers one boolean builtin operator named {@code op.getName()} taking (left, right). */
    private static void addBuiltin(FunctionSet functionSet, Operator op, Type left, Type right) {
        functionSet.addBuiltinBothScalaAndVectorized(ScalarFunction.createBuiltinOperator(
                op.getName(),
                SYMBOL_NOT_USED,
                Lists.<Type>newArrayList(left, right),
                Type.BOOLEAN));
    }

    private final Operator op;

    /**
     * Creates a match predicate.
     *
     * @param op the match operator
     * @param e1 the left operand; must not be null
     * @param e2 the right operand; must not be null
     */
    public MatchPredicate(Operator op, Expr e1, Expr e2) {
        super();
        this.op = op;
        Preconditions.checkNotNull(e1);
        children.add(e1);
        Preconditions.checkNotNull(e2);
        children.add(e2);
        // TODO: Calculate selectivity
        selectivity = Expr.DEFAULT_SELECTIVITY;
    }

    /**
     * Tells whether the given operator is one of the MATCH_ELEMENT_* operators.
     *
     * @param op the operator to test; a null value results in a NullPointerException
     * @return true for MATCH_ELEMENT_EQ/LT/GT/LE/GE, false otherwise
     */
    public Boolean isMatchElement(Operator op) {
        // Compare enum constants directly instead of their name strings.
        switch (op) {
            case MATCH_ELEMENT_EQ:
            case MATCH_ELEMENT_LT:
            case MATCH_ELEMENT_GT:
            case MATCH_ELEMENT_LE:
            case MATCH_ELEMENT_GE:
                return true;
            default:
                return false;
        }
    }

    /** Copy constructor used by {@link #clone()}. */
    protected MatchPredicate(MatchPredicate other) {
        super(other);
        op = other.op;
    }

    @Override
    public Expr clone() {
        return new MatchPredicate(this);
    }

    /** Returns the match operator of this predicate. */
    public Operator getOp() {
        return this.op;
    }

    @Override
    public boolean equals(Object obj) {
        // NOTE(review): the cast below relies on super.equals() having already
        // rejected objects that are not a MatchPredicate - confirm in Expr.
        if (!super.equals(obj)) {
            return false;
        }
        return ((MatchPredicate) obj).op == op;
    }

    /** Renders the predicate as {@code <left> <operator description> <right>}. */
    @Override
    public String toSqlImpl() {
        return getChild(0).toSql() + " " + op.toString() + " " + getChild(1).toSql();
    }

    /** Marks the thrift node as a MATCH_PRED and sets the operator's opcode. */
    @Override
    protected void toThrift(TExprNode msg) {
        msg.node_type = TExprNodeType.MATCH_PRED;
        msg.setOpcode(op.getOpcode());
    }

    /**
     * Validates operand types, resolves the builtin function and, for element
     * operators with a string-literal right operand, casts that literal to the
     * array item type.
     *
     * @throws AnalysisException if an operand has an unsupported type, no
     *         builtin function matches, or the literal cannot be cast
     */
    @Override
    public void analyzeImpl(Analyzer analyzer) throws AnalysisException {
        super.analyzeImpl(analyzer);
        boolean elementOp = isMatchElement(op);
        Type leftType = getChild(0).getType();
        Type rightType = getChild(1).getType();

        if (elementOp && !leftType.isArrayType()) {
            throw new AnalysisException(
                    "left operand of " + op.toString() + " must be Array: " + toSql());
        }
        if (leftType.isObjectStored()) {
            throw new AnalysisException(
                    "left operand of " + op.toString() + " must not be Bitmap or HLL: " + toSql());
        }
        if (!elementOp && !rightType.isStringType() && !rightType.isNull()) {
            throw new AnalysisException("right operand of " + op.toString() + " must be of type STRING: " + toSql());
        }
        if (!leftType.isStringType() && !leftType.isArrayType()) {
            throw new AnalysisException(
                    "left operand of " + op.toString() + " must be of type STRING or ARRAY: " + toSql());
        }

        // Look the function up under the same name initBuiltins() registered it
        // with (getName()), not under the display description (toString()).
        fn = getBuiltinFunction(op.getName(),
                collectChildReturnTypes(), Function.CompareMode.IS_NONSTRICT_SUPERTYPE_OF);
        if (fn == null) {
            throw new AnalysisException(
                    "no function found for " + op.toString() + " " + toSql());
        }

        Expr e1 = getChild(0);
        Expr e2 = getChild(1);
        // Here we cast match_element_xxx value type from string to array item type.
        // Because be need to know the actual TExprNodeType when doing Expr Literal transform
        if (elementOp && e1.type.isArrayType() && (e2 instanceof StringLiteral)) {
            Type itemType = ((ArrayType) e1.type).getItemType();
            try {
                setChild(1, e2.castTo(itemType));
            } catch (NumberFormatException nfe) {
                throw new AnalysisException("Invalid number format literal: " + e2.getStringValue());
            }
        }
    }

    @Override
    public int hashCode() {
        return 31 * super.hashCode() + Objects.hashCode(op);
    }
}

View File

@ -24,6 +24,7 @@ import org.apache.doris.analysis.CompoundPredicate;
import org.apache.doris.analysis.InPredicate;
import org.apache.doris.analysis.IsNullPredicate;
import org.apache.doris.analysis.LikePredicate;
import org.apache.doris.analysis.MatchPredicate;
import org.apache.doris.builtins.ScalarBuiltins;
import org.apache.doris.catalog.Function.NullableMode;
@ -83,6 +84,7 @@ public class FunctionSet<T> {
IsNullPredicate.initBuiltins(this);
ScalarBuiltins.initBuiltins(this);
LikePredicate.initBuiltins(this);
MatchPredicate.initBuiltins(this);
InPredicate.initBuiltins(this);
AliasFunction.initBuiltins(this);

View File

@ -108,6 +108,7 @@ public abstract class Type {
private static final Logger LOG = LogManager.getLogger(Type.class);
private static final ArrayList<ScalarType> integerTypes;
private static final ArrayList<ScalarType> numericTypes;
private static final ArrayList<ScalarType> numericDateTimeTypes;
private static final ArrayList<ScalarType> supportedTypes;
private static final ArrayList<Type> arraySubTypes;
private static final ArrayList<ScalarType> trivialTypes;
@ -129,6 +130,11 @@ public abstract class Type {
numericTypes.add(DECIMAL64);
numericTypes.add(DECIMAL128);
numericDateTimeTypes = Lists.newArrayList();
numericDateTimeTypes.add(DATE);
numericDateTimeTypes.add(DATETIME);
numericDateTimeTypes.addAll(numericTypes);
trivialTypes = Lists.newArrayList();
trivialTypes.addAll(numericTypes);
trivialTypes.add(BOOLEAN);
@ -176,6 +182,10 @@ public abstract class Type {
return numericTypes;
}
public static ArrayList<ScalarType> getNumericDateTimeTypes() {
return numericDateTimeTypes;
}
public static ArrayList<ScalarType> getTrivialTypes() {
return trivialTypes;
}

View File

@ -0,0 +1,41 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.rewrite;
import org.apache.doris.analysis.Analyzer;
import org.apache.doris.analysis.Expr;
import org.apache.doris.analysis.MatchPredicate;
import org.apache.doris.common.AnalysisException;
/**
 * Rewrite rule that rejects a {@link MatchPredicate} appearing anywhere other
 * than the WHERE clause. Expressions that pass the check are returned unchanged.
 */
public final class MatchPredicateRule implements ExprRewriteRule {
    // Singleton instance; final so it cannot be reassigned by other code.
    public static final ExprRewriteRule INSTANCE = new MatchPredicateRule();

    private MatchPredicateRule() {}

    /**
     * Checks that a match predicate is only used in the WHERE clause.
     *
     * @param expr the expression being rewritten
     * @param analyzer the current analyzer (not used by this rule)
     * @param clauseType the clause the expression belongs to
     * @return {@code expr} itself, unmodified
     * @throws AnalysisException if {@code expr} is a {@link MatchPredicate} and
     *         {@code clauseType} is not {@code WHERE_CLAUSE}
     */
    @Override
    public Expr apply(Expr expr, Analyzer analyzer, ExprRewriter.ClauseType clauseType) throws AnalysisException {
        boolean inWhereClause = clauseType == ExprRewriter.ClauseType.WHERE_CLAUSE;
        if (expr instanceof MatchPredicate && !inWhereClause) {
            throw new AnalysisException("Not support in " + clauseType.toString()
                    + ", only support in WHERE_CLAUSE, expression: " + expr.toSql());
        }
        return expr;
    }
}

View File

@ -295,6 +295,15 @@ import org.apache.doris.qe.SqlModeHelper;
keywordMap.put("lock", new Integer(SqlParserSymbols.KW_LOCK));
keywordMap.put("low_priority", new Integer(SqlParserSymbols.KW_LOW_PRIORITY));
keywordMap.put("map", new Integer(SqlParserSymbols.KW_MAP));
keywordMap.put("match", new Integer(SqlParserSymbols.KW_MATCH));
keywordMap.put("match_any", new Integer(SqlParserSymbols.KW_MATCH_ANY));
keywordMap.put("match_all", new Integer(SqlParserSymbols.KW_MATCH_ALL));
keywordMap.put("match_phrase", new Integer(SqlParserSymbols.KW_MATCH_PHRASE));
keywordMap.put("element_eq", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_EQ));
keywordMap.put("element_lt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LT));
keywordMap.put("element_gt", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GT));
keywordMap.put("element_le", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_LE));
keywordMap.put("element_ge", new Integer(SqlParserSymbols.KW_MATCH_ELEMENT_GE));
keywordMap.put("materialized", new Integer(SqlParserSymbols.KW_MATERIALIZED));
keywordMap.put("max", new Integer(SqlParserSymbols.KW_MAX));
keywordMap.put("maxvalue", new Integer(SqlParserSymbols.KW_MAX_VALUE));