Add bitmap_hash function (#2439)

Add a bitmap_hash function.
Add a murmur_hash3_32 hash function.
This commit is contained in:
kangkaisen
2019-12-12 16:55:07 +08:00
committed by ZHAO Chun
parent ded247f001
commit a5f52f80df
8 changed files with 86 additions and 14 deletions

View File

@ -97,6 +97,18 @@ StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const dori
return AnyValUtil::from_string_temp(ctx, buf);
}
// Hashes `src` with HashUtil::murmur_hash3_32 (seeded with
// HashUtil::MURMUR3_32_SEED), records the 32-bit hash in a fresh RoaringBitmap
// and returns the bitmap in serialized form.
//
// A NULL `src` is not hashed: the bitmap receives no update and is serialized
// as-is. The result is handed back through AnyValUtil::from_string_temp.
StringVal BitmapFunctions::bitmap_hash(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) {
    RoaringBitmap bitmap;
    if (!src.is_null) {
        const uint32_t hash_value =
                HashUtil::murmur_hash3_32(src.ptr, src.len, HashUtil::MURMUR3_32_SEED);
        bitmap.update(hash_value);
    }

    std::string buf;
    buf.resize(bitmap.size());
    // Serialize into the string's own mutable storage. The pointer returned by
    // c_str() is const and must not be written through, so take the address of
    // the first element instead (valid even for an empty string since C++11).
    bitmap.serialize(&buf[0]);
    return AnyValUtil::from_string_temp(ctx, buf);
}
StringVal BitmapFunctions::bitmap_serialize(FunctionContext* ctx, const StringVal& src) {
auto* src_bitmap = reinterpret_cast<RoaringBitmap*>(src.ptr);
StringVal result(ctx, src_bitmap->size());

View File

@ -27,6 +27,7 @@ public:
static void init();
static void bitmap_init(FunctionContext* ctx, StringVal* slot);
static StringVal bitmap_empty(FunctionContext* ctx);
template <typename T>
static void bitmap_update_int(FunctionContext* ctx, const T& src, StringVal* dst);
// the input src's ptr need to point a RoaringBitmap, this function will release the
@ -38,6 +39,7 @@ public:
static StringVal bitmap_serialize(FunctionContext* ctx, const StringVal& src);
static StringVal to_bitmap(FunctionContext* ctx, const StringVal& src);
static StringVal bitmap_hash(FunctionContext* ctx, const StringVal& src);
};
}
#endif //DORIS_BE_SRC_QUERY_EXPRS_BITMAP_FUNCTION_H

View File

@ -109,6 +109,59 @@ public:
}
#endif
// refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
// Seed for murmur_hash3_32; BitmapFunctions::bitmap_hash passes it as the
// `seed` argument.
// NOTE(review): presumably chosen to match the default seed of the Java
// implementation linked above - confirm before changing it.
static const uint32_t MURMUR3_32_SEED = 104729;
// Rotates the 32-bit value `x` left by `r` bit positions.
//
// Both shift counts are reduced modulo 32. Without the mask, r == 0 would
// evaluate `x >> 32`, and shifting a 32-bit value by its full width is
// undefined behaviour. For r in [1, 31] the result is identical to the
// unmasked form `(x << r) | (x >> (32 - r))`.
ALWAYS_INLINE static uint32_t rotl32(uint32_t x, int8_t r) {
    const uint32_t left = static_cast<uint32_t>(r) & 31u;
    const uint32_t right = (32u - left) & 31u;
    return (x << left) | (x >> right);
}
// Finalization mix for the 32-bit hash state: alternating xor-shift and
// multiply steps. murmur_hash3_32 applies it to its state right before
// returning.
ALWAYS_INLINE static uint32_t fmix32(uint32_t h) {
    uint32_t mixed = h ^ (h >> 16);
    mixed *= 0x85ebca6b;
    mixed ^= mixed >> 13;
    mixed *= 0xc2b2ae35;
    return mixed ^ (mixed >> 16);
}
// modified from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
// Computes a 32-bit MurmurHash3 of the `len` bytes starting at `key`.
//
// key:  start of the input bytes; no alignment is required.
// len:  number of input bytes. Expected to be non-negative.
// seed: initial value of the hash state.
// Returns the 32-bit hash value.
//
// Each 4-byte block is assembled byte by byte in little-endian order rather
// than read through a `const uint32_t*`. That avoids misaligned, type-punned
// loads (`key` may point anywhere inside a string) and makes the result
// independent of host byte order. On little-endian hosts the value is
// identical to a direct 32-bit load.
static uint32_t murmur_hash3_32(const void* key, int32_t len, uint32_t seed) {
    const uint8_t* data = static_cast<const uint8_t*>(key);
    const int nblocks = len / 4;

    uint32_t h1 = seed;

    const uint32_t c1 = 0xcc9e2d51;
    const uint32_t c2 = 0x1b873593;

    // Body: consume the input one 4-byte block at a time.
    for (int i = 0; i < nblocks; ++i) {
        const uint8_t* block = data + i * 4;
        uint32_t k1 = static_cast<uint32_t>(block[0])
                | (static_cast<uint32_t>(block[1]) << 8)
                | (static_cast<uint32_t>(block[2]) << 16)
                | (static_cast<uint32_t>(block[3]) << 24);

        k1 *= c1;
        k1 = rotl32(k1, 15);
        k1 *= c2;

        h1 ^= k1;
        h1 = rotl32(h1, 13);
        h1 = h1 * 5 + 0xe6546b64;
    }

    // Tail: the remaining 0-3 bytes that do not fill a whole block.
    const uint8_t* tail = data + nblocks * 4;
    uint32_t k1 = 0;
    switch (len & 3) {
    case 3:
        k1 ^= static_cast<uint32_t>(tail[2]) << 16;
        // fall through
    case 2:
        k1 ^= static_cast<uint32_t>(tail[1]) << 8;
        // fall through
    case 1:
        k1 ^= tail[0];
        k1 *= c1;
        k1 = rotl32(k1, 15);
        k1 *= c2;
        h1 ^= k1;
    }

    // Finalization: fold in the length, then mix the state.
    h1 ^= len;
    h1 = fmix32(h1);
    return h1;
}
static const int MURMUR_R = 47;
// Murmur2 hash implementation returning 64-bit hashes.

View File

@ -443,8 +443,10 @@ public class FunctionCallExpr extends Expr {
}
} else if (getChild(0) instanceof FunctionCallExpr) {
FunctionCallExpr functionCallExpr = (FunctionCallExpr) getChild(0);
if (!functionCallExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP)) {
throw new AnalysisException("BITMAP_UNION function only support TO_BITMAP function as it's child");
String fnName = functionCallExpr.getFnName().getFunction();
if (!FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) {
throw new AnalysisException("BITMAP_UNION function only support " +
"to_bitmap, bitmap_hash or bitmap_union function as it's child");
}
} else {
throw new AnalysisException("BITMAP_UNION only support BITMAP_UNION(column) or BITMAP_UNION(TO_BITMAP(column))");

View File

@ -643,8 +643,8 @@ public class InsertStmt extends DdlStmt {
private void checkBitmapCompatibility(Column col, Expr expr) throws AnalysisException {
boolean isCompatible = false;
final String bitmapMismatchLog = "Column's type is BITMAP,"
+ " SelectList must contains BITMAP column, to_bitmap or bitmap_union" +
" or bitmap_empty function's result, column=" + col.getName();
+ " SelectList must contains BITMAP column, to_bitmap, bitmap_hash, bitmap_empty" +
" or bitmap_union function's result, column=" + col.getName();
if (expr instanceof SlotRef) {
final SlotRef slot = (SlotRef) expr;
Column column = slot.getDesc().getColumn();
@ -662,9 +662,10 @@ public class InsertStmt extends DdlStmt {
} else if (expr instanceof FunctionCallExpr) {
final FunctionCallExpr functionExpr = (FunctionCallExpr) expr;
// select id, to_bitmap(id2) from table
// select id, bitmap_empty from table
if (functionExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP)
|| functionExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_EMPTY)) {
// select id, bitmap_hash(id) from table
// select id, bitmap_empty() from table
String fnName = functionExpr.getFnName().getFunction();
if (FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) {
isCompatible = true;
}
}

View File

@ -18,6 +18,7 @@
package org.apache.doris.catalog;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.doris.analysis.ArithmeticExpr;
@ -35,6 +36,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
public class FunctionSet {
private static final Logger LOG = LogManager.getLogger(FunctionSet.class);
@ -514,9 +516,9 @@ public class FunctionSet {
public static final String BITMAP_UNION_COUNT = "bitmap_union_count";
public static final String BITMAP_UNION_INT = "bitmap_union_int";
public static final String BITMAP_COUNT = "bitmap_count";
public static final String BITMAP_EMPTY = "bitmap_empty";
public static final String TO_BITMAP = "to_bitmap";
public static final Set<String> BITMAP_LOAD_FNS = new ImmutableSortedSet.Builder(String.CASE_INSENSITIVE_ORDER)
.add("to_bitmap", "bitmap_hash", "bitmap_empty").build();
private static final Map<Type, String> BITMAP_UNION_INT_SYMBOL =
ImmutableMap.<Type, String>builder()

View File

@ -83,16 +83,14 @@ public abstract class LoadScanNode extends ScanNode {
isCompatible = false;
} else {
FunctionCallExpr fn = (FunctionCallExpr) expr;
if (!fn.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP)
&& !fn.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_EMPTY)) {
String fnName = fn.getFnName().getFunction();
if (!FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) {
isCompatible = false;
}
}
}
if (!isCompatible) {
throw new AnalysisException("bitmap column must use to_bitmap or empty_bitmap function, like "
+ slotDesc.getColumn().getName() + "=to_bitmap(xxx)"
+ slotDesc.getColumn().getName() + "=bitmap_empty()");
throw new AnalysisException("bitmap column must use to_bitmap, bitmap_hash or empty_bitmap function");
}
}

View File

@ -602,6 +602,8 @@ visible_functions = [
[['to_bitmap'], 'VARCHAR', ['VARCHAR'],
'_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE'],
[['bitmap_hash'], 'VARCHAR', ['VARCHAR'],
'_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE'],
[['bitmap_count'], 'BIGINT', ['VARCHAR'],
'_ZN5doris15BitmapFunctions12bitmap_countEPN9doris_udf15FunctionContextERKNS1_9StringValE'],
[['bitmap_empty'], 'VARCHAR', [],