Add bitmap_hash function (#2439)
Add a bitmap_hash function. Add a murmur_hash3_32 hash function.
This commit is contained in:
@ -97,6 +97,18 @@ StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const dori
|
||||
return AnyValUtil::from_string_temp(ctx, buf);
|
||||
}
|
||||
|
||||
// Hashes a string value with the 32-bit MurmurHash3 (seeded with
// HashUtil::MURMUR3_32_SEED), adds the hash to a fresh RoaringBitmap and
// returns that bitmap in serialized form.
//
// ctx : function context handed to AnyValUtil::from_string_temp for the result.
// src : input string; when src.is_null nothing is hashed and the
//       default-constructed bitmap is serialized as-is.
// Returns the serialized bitmap wrapped in a StringVal.
StringVal BitmapFunctions::bitmap_hash(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) {
    RoaringBitmap bitmap;
    if (!src.is_null) {
        uint32_t hash_value = HashUtil::murmur_hash3_32(src.ptr, src.len, HashUtil::MURMUR3_32_SEED);
        bitmap.update(hash_value);
    }

    std::string buf;
    buf.resize(bitmap.size());
    // Serialize through a mutable pointer into the string's storage: writing
    // via the const buffer returned by c_str() is undefined behavior.
    bitmap.serialize(&buf[0]);
    return AnyValUtil::from_string_temp(ctx, buf);
}
|
||||
|
||||
StringVal BitmapFunctions::bitmap_serialize(FunctionContext* ctx, const StringVal& src) {
|
||||
auto* src_bitmap = reinterpret_cast<RoaringBitmap*>(src.ptr);
|
||||
StringVal result(ctx, src_bitmap->size());
|
||||
|
||||
@ -27,6 +27,7 @@ public:
|
||||
static void init();
|
||||
static void bitmap_init(FunctionContext* ctx, StringVal* slot);
|
||||
static StringVal bitmap_empty(FunctionContext* ctx);
|
||||
|
||||
template <typename T>
|
||||
static void bitmap_update_int(FunctionContext* ctx, const T& src, StringVal* dst);
|
||||
// the input src's ptr need to point a RoaringBitmap, this function will release the
|
||||
@ -38,6 +39,7 @@ public:
|
||||
|
||||
static StringVal bitmap_serialize(FunctionContext* ctx, const StringVal& src);
|
||||
static StringVal to_bitmap(FunctionContext* ctx, const StringVal& src);
|
||||
static StringVal bitmap_hash(FunctionContext* ctx, const StringVal& src);
|
||||
};
|
||||
}
|
||||
#endif //DORIS_BE_SRC_QUERY_EXPRS_BITMAP_FUNCTION_H
|
||||
|
||||
@ -109,6 +109,59 @@ public:
|
||||
}
|
||||
#endif
|
||||
|
||||
// refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java
|
||||
// Seed value for murmur_hash3_32; BitmapFunctions::bitmap_hash passes it as the seed argument.
// NOTE(review): 104729 presumably mirrors the default seed of the commons-codec
// MurmurHash3 implementation referenced above - confirm.
static const uint32_t MURMUR3_32_SEED = 104729;
|
||||
|
||||
// Rotates the 32-bit value x left by r bit positions.
//
// Both shift counts are reduced modulo 32, so r == 0 (or any multiple of 32)
// returns x unchanged instead of shifting a 32-bit operand by 32 bits, which
// is undefined behavior. For r in [1, 31] the result is the same as the plain
// (x << r) | (x >> (32 - r)) form.
ALWAYS_INLINE static uint32_t rotl32(uint32_t x, int8_t r) {
    const uint32_t left = static_cast<uint32_t>(r) & 31u;
    const uint32_t right = (32u - left) & 31u;
    return (x << left) | (x >> right);
}
|
||||
|
||||
// Final mixing step that murmur_hash3_32 applies to its hash state:
// xor-shift right by 16, multiply, xor-shift right by 13, multiply,
// xor-shift right by 16. All arithmetic wraps modulo 2^32.
ALWAYS_INLINE static uint32_t fmix32(uint32_t h) {
    const uint32_t first_multiplier = 0x85ebca6b;
    const uint32_t second_multiplier = 0xc2b2ae35;

    h = (h ^ (h >> 16)) * first_multiplier;
    h = (h ^ (h >> 13)) * second_multiplier;
    return h ^ (h >> 16);
}
|
||||
|
||||
// modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
|
||||
static uint32_t murmur_hash3_32(const void* key, int32_t len, uint32_t seed) {
|
||||
const uint8_t* data = (const uint8_t*)key;
|
||||
const int nblocks = len / 4;
|
||||
|
||||
uint32_t h1 = seed;
|
||||
|
||||
const uint32_t c1 = 0xcc9e2d51;
|
||||
const uint32_t c2 = 0x1b873593;
|
||||
const uint32_t * blocks = (const uint32_t *)(data + nblocks * 4);
|
||||
|
||||
for(int i = -nblocks; i; i++) {
|
||||
uint32_t k1 = blocks[i];
|
||||
|
||||
k1 *= c1;
|
||||
k1 = rotl32(k1,15);
|
||||
k1 *= c2;
|
||||
|
||||
h1 ^= k1;
|
||||
h1 = rotl32(h1,13);
|
||||
h1 = h1 * 5 + 0xe6546b64;
|
||||
}
|
||||
|
||||
const uint8_t * tail = (const uint8_t*)(data + nblocks * 4);
|
||||
uint32_t k1 = 0;
|
||||
switch(len & 3) {
|
||||
case 3: k1 ^= tail[2] << 16;
|
||||
case 2: k1 ^= tail[1] << 8;
|
||||
case 1: k1 ^= tail[0];
|
||||
k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
|
||||
};
|
||||
|
||||
h1 ^= len;
|
||||
h1 = fmix32(h1);
|
||||
return h1;
|
||||
}
|
||||
|
||||
static const int MURMUR_R = 47;
|
||||
|
||||
// Murmur2 hash implementation returning 64-bit hashes.
|
||||
|
||||
@ -443,8 +443,10 @@ public class FunctionCallExpr extends Expr {
|
||||
}
|
||||
} else if (getChild(0) instanceof FunctionCallExpr) {
|
||||
FunctionCallExpr functionCallExpr = (FunctionCallExpr) getChild(0);
|
||||
if (!functionCallExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP)) {
|
||||
throw new AnalysisException("BITMAP_UNION function only support TO_BITMAP function as it's child");
|
||||
String fnName = functionCallExpr.getFnName().getFunction();
|
||||
if (!FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) {
|
||||
throw new AnalysisException("BITMAP_UNION function only support " +
|
||||
"to_bitmap, bitmap_hash or bitmap_union function as it's child");
|
||||
}
|
||||
} else {
|
||||
throw new AnalysisException("BITMAP_UNION only support BITMAP_UNION(column) or BITMAP_UNION(TO_BITMAP(column))");
|
||||
|
||||
@ -643,8 +643,8 @@ public class InsertStmt extends DdlStmt {
|
||||
private void checkBitmapCompatibility(Column col, Expr expr) throws AnalysisException {
|
||||
boolean isCompatible = false;
|
||||
final String bitmapMismatchLog = "Column's type is BITMAP,"
|
||||
+ " SelectList must contains BITMAP column, to_bitmap or bitmap_union" +
|
||||
" or bitmap_empty function's result, column=" + col.getName();
|
||||
+ " SelectList must contains BITMAP column, to_bitmap, bitmap_hash, bitmap_empty" +
|
||||
" or bitmap_union function's result, column=" + col.getName();
|
||||
if (expr instanceof SlotRef) {
|
||||
final SlotRef slot = (SlotRef) expr;
|
||||
Column column = slot.getDesc().getColumn();
|
||||
@ -662,9 +662,10 @@ public class InsertStmt extends DdlStmt {
|
||||
} else if (expr instanceof FunctionCallExpr) {
|
||||
final FunctionCallExpr functionExpr = (FunctionCallExpr) expr;
|
||||
// select id, to_bitmap(id2) from table
|
||||
// select id, bitmap_empty from table
|
||||
if (functionExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP)
|
||||
|| functionExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_EMPTY)) {
|
||||
// select id, bitmap_hash(id) from table
|
||||
// select id, bitmap_empty() from table
|
||||
String fnName = functionExpr.getFnName().getFunction();
|
||||
if (FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) {
|
||||
isCompatible = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
package org.apache.doris.catalog;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.collect.ImmutableSortedSet;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import org.apache.doris.analysis.ArithmeticExpr;
|
||||
@ -35,6 +36,7 @@ import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
public class FunctionSet {
|
||||
private static final Logger LOG = LogManager.getLogger(FunctionSet.class);
|
||||
@ -514,9 +516,9 @@ public class FunctionSet {
|
||||
public static final String BITMAP_UNION_COUNT = "bitmap_union_count";
|
||||
public static final String BITMAP_UNION_INT = "bitmap_union_int";
|
||||
public static final String BITMAP_COUNT = "bitmap_count";
|
||||
public static final String BITMAP_EMPTY = "bitmap_empty";
|
||||
public static final String TO_BITMAP = "to_bitmap";
|
||||
|
||||
public static final Set<String> BITMAP_LOAD_FNS = new ImmutableSortedSet.Builder(String.CASE_INSENSITIVE_ORDER)
|
||||
.add("to_bitmap", "bitmap_hash", "bitmap_empty").build();
|
||||
|
||||
private static final Map<Type, String> BITMAP_UNION_INT_SYMBOL =
|
||||
ImmutableMap.<Type, String>builder()
|
||||
|
||||
@ -83,16 +83,14 @@ public abstract class LoadScanNode extends ScanNode {
|
||||
isCompatible = false;
|
||||
} else {
|
||||
FunctionCallExpr fn = (FunctionCallExpr) expr;
|
||||
if (!fn.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP)
|
||||
&& !fn.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_EMPTY)) {
|
||||
String fnName = fn.getFnName().getFunction();
|
||||
if (!FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) {
|
||||
isCompatible = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!isCompatible) {
|
||||
throw new AnalysisException("bitmap column must use to_bitmap or empty_bitmap function, like "
|
||||
+ slotDesc.getColumn().getName() + "=to_bitmap(xxx)"
|
||||
+ slotDesc.getColumn().getName() + "=bitmap_empty()");
|
||||
throw new AnalysisException("bitmap column must use to_bitmap, bitmap_hash or empty_bitmap function");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -602,6 +602,8 @@ visible_functions = [
|
||||
|
||||
[['to_bitmap'], 'VARCHAR', ['VARCHAR'],
|
||||
'_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE'],
|
||||
[['bitmap_hash'], 'VARCHAR', ['VARCHAR'],
|
||||
'_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE'],
|
||||
[['bitmap_count'], 'BIGINT', ['VARCHAR'],
|
||||
'_ZN5doris15BitmapFunctions12bitmap_countEPN9doris_udf15FunctionContextERKNS1_9StringValE'],
|
||||
[['bitmap_empty'], 'VARCHAR', [],
|
||||
|
||||
Reference in New Issue
Block a user