[Feature](NGram BloomFilter Index) add new ngram bloom filter index to speed up like query (#11579)

This PR implements a new bloom filter index, the NGram bloom filter index, which was proposed in #10733.
The new index can greatly improve LIKE query performance; in some of our test cases it yields an order-of-magnitude improvement.
For usage, see the docs in this PR. The index depends on ```enable_function_pushdown```:
you need to set it to ```true``` for the index to take effect on LIKE queries.
This commit is contained in:
Jet He
2022-12-28 18:01:50 +08:00
committed by GitHub
parent 0f8b15b902
commit 75aa00d3d0
44 changed files with 1720 additions and 27 deletions

View File

@ -267,6 +267,7 @@ terminal String
KW_BINLOG,
KW_BITMAP,
KW_BITMAP_UNION,
KW_NGRAM_BF,
KW_BLOB,
KW_BOOLEAN,
KW_BROKER,
@ -2143,7 +2144,7 @@ opt_password_lock_time ::=
| KW_PASSWORD_LOCK_TIME passwd_lock_time_opt:opt
{:
RESULT = opt;
:}
:}
;
passwd_lock_time_opt ::=
@ -3270,6 +3271,10 @@ opt_index_type ::=
{:
RESULT = IndexDef.IndexType.BITMAP;
:}
| KW_USING KW_NGRAM_BF
{:
RESULT = IndexDef.IndexType.NGRAM_BF;
:}
| KW_USING KW_INVERTED
{:
RESULT = IndexDef.IndexType.INVERTED;
@ -5662,10 +5667,10 @@ func_args_def ::=
cast_expr ::=
KW_CAST LPAREN expr:e KW_AS type_def:targetType RPAREN
{:
CastExpr castExpr = new CastExpr(targetType, e);
{:
CastExpr castExpr = new CastExpr(targetType, e);
if (targetType.getType().getLength() != -1
&& (targetType.getType().getPrimitiveType() == PrimitiveType.VARCHAR
&& (targetType.getType().getPrimitiveType() == PrimitiveType.VARCHAR
|| targetType.getType().getPrimitiveType() == PrimitiveType.CHAR)) {
// transfer cast(xx as char(N)/varchar(N)) to substr(cast(xx as char), 1, N)
// this is just a workaround to make the result correct
@ -6504,6 +6509,8 @@ keyword ::=
{: RESULT = id; :}
| KW_BITMAP_UNION:id
{: RESULT = id; :}
| KW_NGRAM_BF:id
{: RESULT = id; :}
| KW_QUANTILE_UNION:id
{: RESULT = id; :}
| KW_BLOB:id

View File

@ -1247,6 +1247,8 @@ public class SchemaChangeHandler extends AlterHandler {
bfFpp = 0;
}
Index.checkConflict(newSet, bfColumns);
// property 3: timeout
long timeoutSecond = PropertyAnalyzer.analyzeTimeout(propertyMap, Config.alter_table_timeout_second);
@ -2058,9 +2060,13 @@ public class SchemaChangeHandler extends AlterHandler {
}
Set<String> existedIdxColSet = Sets.newTreeSet(String.CASE_INSENSITIVE_ORDER);
existedIdxColSet.addAll(existedIdx.getColumns());
if (newColset.equals(existedIdxColSet)) {
if (existedIdx.getIndexType() == indexDef.getIndexType() && newColset.equals(existedIdxColSet)) {
throw new DdlException(
"index for columns (" + String.join(",", indexDef.getColumns()) + " ) already exist.");
indexDef.getIndexType()
+ " index for columns ("
+ String.join(",", indexDef.getColumns())
+ " ) already exist."
);
}
}
@ -2069,7 +2075,7 @@ public class SchemaChangeHandler extends AlterHandler {
if (column != null) {
indexDef.checkColumn(column, olapTable.getKeysType());
} else {
throw new DdlException("BITMAP column does not exist in table. invalid column: " + col);
throw new DdlException("index column does not exist in table. invalid column: " + col);
}
}

View File

@ -37,6 +37,11 @@ public class IndexDef {
private String comment;
private Map<String, String> properties;
public static final String NGRAM_SIZE_KEY = "gram_size";
public static final String NGRAM_BF_SIZE_KEY = "bf_size";
public static final String DEFAULT_NGRAM_SIZE = "2";
public static final String DEFAULT_NGRAM_BF_SIZE = "256";
public IndexDef(String indexName, boolean ifNotExists, List<String> columns, IndexType indexType,
Map<String, String> properties, String comment) {
this.indexName = indexName;
@ -57,6 +62,10 @@ public class IndexDef {
} else {
this.properties = properties;
}
if (indexType == IndexType.NGRAM_BF) {
properties.putIfAbsent(NGRAM_SIZE_KEY, DEFAULT_NGRAM_SIZE);
properties.putIfAbsent(NGRAM_BF_SIZE_KEY, DEFAULT_NGRAM_BF_SIZE);
}
}
public void analyze() throws AnalysisException {
@ -155,6 +164,7 @@ public class IndexDef {
BITMAP,
INVERTED,
BLOOMFILTER,
NGRAM_BF
}
public boolean isInvertedIndex() {
@ -162,7 +172,8 @@ public class IndexDef {
}
public void checkColumn(Column column, KeysType keysType) throws AnalysisException {
if (indexType == IndexType.BITMAP || indexType == IndexType.INVERTED || indexType == IndexType.BLOOMFILTER) {
if (indexType == IndexType.BITMAP || indexType == IndexType.INVERTED || indexType == IndexType.BLOOMFILTER
|| indexType == IndexType.NGRAM_BF) {
String indexColName = column.getName();
PrimitiveType colType = column.getDataType();
if (!(colType.isDateType() || colType.isDecimalV2Type() || colType.isDecimalV3Type()
@ -177,6 +188,31 @@ public class IndexDef {
if (indexType == IndexType.INVERTED) {
InvertedIndexUtil.checkInvertedIndexParser(indexColName, colType, properties);
} else if (indexType == IndexType.NGRAM_BF) {
if (colType != PrimitiveType.CHAR && colType != PrimitiveType.VARCHAR
&& colType != PrimitiveType.STRING) {
throw new AnalysisException(colType + " is not supported in ngram_bf index. "
+ "invalid column: " + indexColName);
} else if ((keysType == KeysType.AGG_KEYS && !column.isKey())) {
throw new AnalysisException(
"ngram_bf index only used in columns of DUP_KEYS/UNIQUE_KEYS table or key columns of"
+ " AGG_KEYS table. invalid column: " + indexColName);
}
if (properties.size() != 2) {
throw new AnalysisException("ngram_bf index should have gram_size and bf_size properties");
}
try {
int ngramSize = Integer.parseInt(properties.get(NGRAM_SIZE_KEY));
int bfSize = Integer.parseInt(properties.get(NGRAM_BF_SIZE_KEY));
if (ngramSize > 256 || ngramSize < 1) {
throw new AnalysisException("gram_size should be integer and less than 256");
}
if (bfSize > 65536 || bfSize < 64) {
throw new AnalysisException("bf_size should be integer and between 64 and 65536");
}
} catch (NumberFormatException e) {
throw new AnalysisException("invalid ngram properties:" + e.getMessage(), e);
}
}
} else {
throw new AnalysisException("Unsupported index type: " + indexType);

View File

@ -19,6 +19,7 @@ package org.apache.doris.catalog;
import org.apache.doris.analysis.IndexDef;
import org.apache.doris.analysis.InvertedIndexUtil;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.io.Text;
import org.apache.doris.common.io.Writable;
import org.apache.doris.common.util.PrintableMap;
@ -32,9 +33,13 @@ import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* Internal representation of index, including index type, name, columns and comments.
@ -197,4 +202,31 @@ public class Index implements Writable {
}
return tIndex;
}
/**
 * Verifies that no column is covered by more than one bloom-filter-style index.
 *
 * <p>Columns of every {@code NGRAM_BF} or {@code BLOOMFILTER} index in {@code indices} and every
 * name in {@code bloomFilters} are pooled into one set, compared case-insensitively. The first
 * column encountered a second time causes the check to fail.
 *
 * @param indices index definitions to inspect; {@code null} is treated as empty
 * @param bloomFilters column names that already carry a bloom filter (callers pass their
 *                     {@code bfColumns} set); {@code null} is treated as empty
 * @throws AnalysisException if any column appears more than once across both inputs
 */
public static void checkConflict(Collection<Index> indices, Set<String> bloomFilters) throws AnalysisException {
    Set<String> seenColumns = new HashSet<>();
    if (indices != null) {
        for (Index index : indices) {
            IndexDef.IndexType type = index.getIndexType();
            // Only bloom-filter-style indexes conflict with each other; other index types are ignored.
            if (type != IndexDef.IndexType.NGRAM_BF && type != IndexDef.IndexType.BLOOMFILTER) {
                continue;
            }
            for (String column : index.getColumns()) {
                registerBloomFilterColumn(seenColumns, column);
            }
        }
    }
    if (bloomFilters != null) {
        for (String column : bloomFilters) {
            registerBloomFilterColumn(seenColumns, column);
        }
    }
}

/**
 * Records {@code column} (lower-cased) in {@code seenColumns}, failing if it was already present.
 */
private static void registerBloomFilterColumn(Set<String> seenColumns, String column)
        throws AnalysisException {
    // Locale.ROOT keeps the case folding independent of the JVM's default locale
    // (e.g. the Turkish dotted/dotless i), so the same column names always collide.
    String normalized = column.toLowerCase(java.util.Locale.ROOT);
    if (!seenColumns.add(normalized)) {
        throw new AnalysisException(normalized + " should have only one ngram bloom filter index or bloom "
                + "filter index");
    }
}
}

View File

@ -1918,6 +1918,8 @@ public class InternalCatalog implements CatalogIf<Database> {
throw new DdlException(e.getMessage());
}
Index.checkConflict(stmt.getIndexes(), bfColumns);
olapTable.setReplicationAllocation(replicaAlloc);
// set in memory

View File

@ -120,6 +120,7 @@ import org.apache.doris.qe.SqlModeHelper;
keywordMap.put("bitmap", new Integer(SqlParserSymbols.KW_BITMAP));
keywordMap.put("inverted", new Integer(SqlParserSymbols.KW_INVERTED));
keywordMap.put("bitmap_union", new Integer(SqlParserSymbols.KW_BITMAP_UNION));
keywordMap.put("ngram_bf", new Integer(SqlParserSymbols.KW_NGRAM_BF));
keywordMap.put("blob", new Integer(SqlParserSymbols.KW_BLOB));
keywordMap.put("boolean", new Integer(SqlParserSymbols.KW_BOOLEAN));
keywordMap.put("broker", new Integer(SqlParserSymbols.KW_BROKER));