[Feature](NGram BloomFilter Index) add new ngram bloom filter index to speed up like query (#11579)
This PR implements a new bloom filter index, the NGram bloom filter index, which was proposed in #10733. The new index can greatly improve LIKE query performance; in some of our test cases it yields an order-of-magnitude improvement. For usage, see the docs in this PR. The index relies on ```enable_function_pushdown```; you need to set it to ```true``` for the index to take effect on LIKE queries.
This commit is contained in:
@ -267,6 +267,7 @@ terminal String
|
||||
KW_BINLOG,
|
||||
KW_BITMAP,
|
||||
KW_BITMAP_UNION,
|
||||
KW_NGRAM_BF,
|
||||
KW_BLOB,
|
||||
KW_BOOLEAN,
|
||||
KW_BROKER,
|
||||
@ -2143,7 +2144,7 @@ opt_password_lock_time ::=
|
||||
| KW_PASSWORD_LOCK_TIME passwd_lock_time_opt:opt
|
||||
{:
|
||||
RESULT = opt;
|
||||
:}
|
||||
:}
|
||||
;
|
||||
|
||||
passwd_lock_time_opt ::=
|
||||
@ -3270,6 +3271,10 @@ opt_index_type ::=
|
||||
{:
|
||||
RESULT = IndexDef.IndexType.BITMAP;
|
||||
:}
|
||||
| KW_USING KW_NGRAM_BF
|
||||
{:
|
||||
RESULT = IndexDef.IndexType.NGRAM_BF;
|
||||
:}
|
||||
| KW_USING KW_INVERTED
|
||||
{:
|
||||
RESULT = IndexDef.IndexType.INVERTED;
|
||||
@ -5662,10 +5667,10 @@ func_args_def ::=
|
||||
|
||||
cast_expr ::=
|
||||
KW_CAST LPAREN expr:e KW_AS type_def:targetType RPAREN
|
||||
{:
|
||||
CastExpr castExpr = new CastExpr(targetType, e);
|
||||
{:
|
||||
CastExpr castExpr = new CastExpr(targetType, e);
|
||||
if (targetType.getType().getLength() != -1
|
||||
&& (targetType.getType().getPrimitiveType() == PrimitiveType.VARCHAR
|
||||
&& (targetType.getType().getPrimitiveType() == PrimitiveType.VARCHAR
|
||||
|| targetType.getType().getPrimitiveType() == PrimitiveType.CHAR)) {
|
||||
// transfer cast(xx as char(N)/varchar(N)) to substr(cast(xx as char), 1, N)
|
||||
// this is just a workaround to make the result correct
|
||||
@ -6504,6 +6509,8 @@ keyword ::=
|
||||
{: RESULT = id; :}
|
||||
| KW_BITMAP_UNION:id
|
||||
{: RESULT = id; :}
|
||||
| KW_NGRAM_BF:id
|
||||
{: RESULT = id; :}
|
||||
| KW_QUANTILE_UNION:id
|
||||
{: RESULT = id; :}
|
||||
| KW_BLOB:id
|
||||
|
||||
@ -1247,6 +1247,8 @@ public class SchemaChangeHandler extends AlterHandler {
|
||||
bfFpp = 0;
|
||||
}
|
||||
|
||||
Index.checkConflict(newSet, bfColumns);
|
||||
|
||||
// property 3: timeout
|
||||
long timeoutSecond = PropertyAnalyzer.analyzeTimeout(propertyMap, Config.alter_table_timeout_second);
|
||||
|
||||
@ -2058,9 +2060,13 @@ public class SchemaChangeHandler extends AlterHandler {
|
||||
}
|
||||
Set<String> existedIdxColSet = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
|
||||
existedIdxColSet.addAll(existedIdx.getColumns());
|
||||
if (newColset.equals(existedIdxColSet)) {
|
||||
if (existedIdx.getIndexType() == indexDef.getIndexType() && newColset.equals(existedIdxColSet)) {
|
||||
throw new DdlException(
|
||||
"index for columns (" + String.join(",", indexDef.getColumns()) + " ) already exist.");
|
||||
indexDef.getIndexType()
|
||||
+ " index for columns ("
|
||||
+ String.join(",", indexDef.getColumns())
|
||||
+ " ) already exist."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2069,7 +2075,7 @@ public class SchemaChangeHandler extends AlterHandler {
|
||||
if (column != null) {
|
||||
indexDef.checkColumn(column, olapTable.getKeysType());
|
||||
} else {
|
||||
throw new DdlException("BITMAP column does not exist in table. invalid column: " + col);
|
||||
throw new DdlException("index column does not exist in table. invalid column: " + col);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -37,6 +37,11 @@ public class IndexDef {
|
||||
private String comment;
|
||||
private Map<String, String> properties;
|
||||
|
||||
public static final String NGRAM_SIZE_KEY = "gram_size";
|
||||
public static final String NGRAM_BF_SIZE_KEY = "bf_size";
|
||||
public static final String DEFAULT_NGRAM_SIZE = "2";
|
||||
public static final String DEFAULT_NGRAM_BF_SIZE = "256";
|
||||
|
||||
public IndexDef(String indexName, boolean ifNotExists, List<String> columns, IndexType indexType,
|
||||
Map<String, String> properties, String comment) {
|
||||
this.indexName = indexName;
|
||||
@ -57,6 +62,10 @@ public class IndexDef {
|
||||
} else {
|
||||
this.properties = properties;
|
||||
}
|
||||
if (indexType == IndexType.NGRAM_BF) {
|
||||
properties.putIfAbsent(NGRAM_SIZE_KEY, DEFAULT_NGRAM_SIZE);
|
||||
properties.putIfAbsent(NGRAM_BF_SIZE_KEY, DEFAULT_NGRAM_BF_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
public void analyze() throws AnalysisException {
|
||||
@ -155,6 +164,7 @@ public class IndexDef {
|
||||
BITMAP,
|
||||
INVERTED,
|
||||
BLOOMFILTER,
|
||||
NGRAM_BF
|
||||
}
|
||||
|
||||
public boolean isInvertedIndex() {
|
||||
@ -162,7 +172,8 @@ public class IndexDef {
|
||||
}
|
||||
|
||||
public void checkColumn(Column column, KeysType keysType) throws AnalysisException {
|
||||
if (indexType == IndexType.BITMAP || indexType == IndexType.INVERTED || indexType == IndexType.BLOOMFILTER) {
|
||||
if (indexType == IndexType.BITMAP || indexType == IndexType.INVERTED || indexType == IndexType.BLOOMFILTER
|
||||
|| indexType == IndexType.NGRAM_BF) {
|
||||
String indexColName = column.getName();
|
||||
PrimitiveType colType = column.getDataType();
|
||||
if (!(colType.isDateType() || colType.isDecimalV2Type() || colType.isDecimalV3Type()
|
||||
@ -177,6 +188,31 @@ public class IndexDef {
|
||||
|
||||
if (indexType == IndexType.INVERTED) {
|
||||
InvertedIndexUtil.checkInvertedIndexParser(indexColName, colType, properties);
|
||||
} else if (indexType == IndexType.NGRAM_BF) {
|
||||
if (colType != PrimitiveType.CHAR && colType != PrimitiveType.VARCHAR
|
||||
&& colType != PrimitiveType.STRING) {
|
||||
throw new AnalysisException(colType + " is not supported in ngram_bf index. "
|
||||
+ "invalid column: " + indexColName);
|
||||
} else if ((keysType == KeysType.AGG_KEYS && !column.isKey())) {
|
||||
throw new AnalysisException(
|
||||
"ngram_bf index only used in columns of DUP_KEYS/UNIQUE_KEYS table or key columns of"
|
||||
+ " AGG_KEYS table. invalid column: " + indexColName);
|
||||
}
|
||||
if (properties.size() != 2) {
|
||||
throw new AnalysisException("ngram_bf index should have gram_size and bf_size properties");
|
||||
}
|
||||
try {
|
||||
int ngramSize = Integer.parseInt(properties.get(NGRAM_SIZE_KEY));
|
||||
int bfSize = Integer.parseInt(properties.get(NGRAM_BF_SIZE_KEY));
|
||||
if (ngramSize > 256 || ngramSize < 1) {
|
||||
throw new AnalysisException("gram_size should be integer and less than 256");
|
||||
}
|
||||
if (bfSize > 65536 || bfSize < 64) {
|
||||
throw new AnalysisException("bf_size should be integer and between 64 and 65536");
|
||||
}
|
||||
} catch (NumberFormatException e) {
|
||||
throw new AnalysisException("invalid ngram properties:" + e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new AnalysisException("Unsupported index type: " + indexType);
|
||||
|
||||
@ -19,6 +19,7 @@ package org.apache.doris.catalog;
|
||||
|
||||
import org.apache.doris.analysis.IndexDef;
|
||||
import org.apache.doris.analysis.InvertedIndexUtil;
|
||||
import org.apache.doris.common.AnalysisException;
|
||||
import org.apache.doris.common.io.Text;
|
||||
import org.apache.doris.common.io.Writable;
|
||||
import org.apache.doris.common.util.PrintableMap;
|
||||
@ -32,9 +33,13 @@ import java.io.DataInput;
|
||||
import java.io.DataOutput;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Internal representation of index, including index type, name, columns and comments.
|
||||
@ -197,4 +202,31 @@ public class Index implements Writable {
|
||||
}
|
||||
return tIndex;
|
||||
}
|
||||
|
||||
public static void checkConflict(Collection<Index> indices, Set<String> bloomFilters) throws AnalysisException {
|
||||
indices = indices == null ? Collections.emptyList() : indices;
|
||||
bloomFilters = bloomFilters == null ? Collections.emptySet() : bloomFilters;
|
||||
Set<String> bfColumns = new HashSet<>();
|
||||
for (Index index : indices) {
|
||||
if (IndexDef.IndexType.NGRAM_BF == index.getIndexType()
|
||||
|| IndexDef.IndexType.BLOOMFILTER == index.getIndexType()) {
|
||||
for (String column : index.getColumns()) {
|
||||
column = column.toLowerCase();
|
||||
if (bfColumns.contains(column)) {
|
||||
throw new AnalysisException(column + " should have only one ngram bloom filter index or bloom "
|
||||
+ "filter index");
|
||||
}
|
||||
bfColumns.add(column);
|
||||
}
|
||||
}
|
||||
}
|
||||
for (String column : bloomFilters) {
|
||||
column = column.toLowerCase();
|
||||
if (bfColumns.contains(column)) {
|
||||
throw new AnalysisException(column + " should have only one ngram bloom filter index or bloom "
|
||||
+ "filter index");
|
||||
}
|
||||
bfColumns.add(column);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -1918,6 +1918,8 @@ public class InternalCatalog implements CatalogIf<Database> {
|
||||
throw new DdlException(e.getMessage());
|
||||
}
|
||||
|
||||
Index.checkConflict(stmt.getIndexes(), bfColumns);
|
||||
|
||||
olapTable.setReplicationAllocation(replicaAlloc);
|
||||
|
||||
// set in memory
|
||||
|
||||
@ -120,6 +120,7 @@ import org.apache.doris.qe.SqlModeHelper;
|
||||
keywordMap.put("bitmap", new Integer(SqlParserSymbols.KW_BITMAP));
|
||||
keywordMap.put("inverted", new Integer(SqlParserSymbols.KW_INVERTED));
|
||||
keywordMap.put("bitmap_union", new Integer(SqlParserSymbols.KW_BITMAP_UNION));
|
||||
keywordMap.put("ngram_bf", new Integer(SqlParserSymbols.KW_NGRAM_BF));
|
||||
keywordMap.put("blob", new Integer(SqlParserSymbols.KW_BLOB));
|
||||
keywordMap.put("boolean", new Integer(SqlParserSymbols.KW_BOOLEAN));
|
||||
keywordMap.put("broker", new Integer(SqlParserSymbols.KW_BROKER));
|
||||
|
||||
Reference in New Issue
Block a user