From a5f52f80dff0cb1e36cebdd3957e8ecb53398265 Mon Sep 17 00:00:00 2001 From: kangkaisen Date: Thu, 12 Dec 2019 16:55:07 +0800 Subject: [PATCH] Add bitmap_hash function (#2439) Add a bitmap_hash function. Add a murmur_hash3_32 hash function. --- be/src/exprs/bitmap_function.cpp | 12 +++++ be/src/exprs/bitmap_function.h | 2 + be/src/util/hash_util.hpp | 53 +++++++++++++++++++ .../doris/analysis/FunctionCallExpr.java | 6 ++- .../org/apache/doris/analysis/InsertStmt.java | 11 ++-- .../org/apache/doris/catalog/FunctionSet.java | 6 ++- .../apache/doris/planner/LoadScanNode.java | 8 ++- gensrc/script/doris_builtins_functions.py | 2 + 8 files changed, 86 insertions(+), 14 deletions(-) diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp index 2f1942a63e..bc03ab24e3 100644 --- a/be/src/exprs/bitmap_function.cpp +++ b/be/src/exprs/bitmap_function.cpp @@ -97,6 +97,18 @@ StringVal BitmapFunctions::to_bitmap(doris_udf::FunctionContext* ctx, const dori return AnyValUtil::from_string_temp(ctx, buf); } +StringVal BitmapFunctions::bitmap_hash(doris_udf::FunctionContext* ctx, const doris_udf::StringVal& src) { + RoaringBitmap bitmap; + if (!src.is_null) { + uint32_t hash_value = HashUtil::murmur_hash3_32(src.ptr, src.len, HashUtil::MURMUR3_32_SEED); + bitmap.update(hash_value); + } + std::string buf; + buf.resize(bitmap.size()); + bitmap.serialize((char*)buf.c_str()); + return AnyValUtil::from_string_temp(ctx, buf); +} + StringVal BitmapFunctions::bitmap_serialize(FunctionContext* ctx, const StringVal& src) { auto* src_bitmap = reinterpret_cast(src.ptr); StringVal result(ctx, src_bitmap->size()); diff --git a/be/src/exprs/bitmap_function.h b/be/src/exprs/bitmap_function.h index a8fa9238e2..aef5016dd2 100644 --- a/be/src/exprs/bitmap_function.h +++ b/be/src/exprs/bitmap_function.h @@ -27,6 +27,7 @@ public: static void init(); static void bitmap_init(FunctionContext* ctx, StringVal* slot); static StringVal bitmap_empty(FunctionContext* ctx); + template static void bitmap_update_int(FunctionContext* ctx, const T& src, StringVal* dst); // the input src's ptr need to point a RoaringBitmap, this function will release the @@ -38,6 +39,7 @@ public: static StringVal bitmap_serialize(FunctionContext* ctx, const StringVal& src); static StringVal to_bitmap(FunctionContext* ctx, const StringVal& src); + static StringVal bitmap_hash(FunctionContext* ctx, const StringVal& src); }; } #endif //DORIS_BE_SRC_QUERY_EXPRS_BITMAP_FUNCTION_H diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index f5e9ce8250..ccbc79bfa8 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -109,6 +109,59 @@ public: } #endif + // refer to https://github.com/apache/commons-codec/blob/master/src/main/java/org/apache/commons/codec/digest/MurmurHash3.java + static const uint32_t MURMUR3_32_SEED = 104729; + + ALWAYS_INLINE static uint32_t rotl32(uint32_t x, int8_t r) { + return (x << r) | (x >> (32 - r)); + } + + ALWAYS_INLINE static uint32_t fmix32(uint32_t h) { + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + return h; + } + + // modify from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp + static uint32_t murmur_hash3_32(const void* key, int32_t len, uint32_t seed) { + const uint8_t* data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + const uint32_t c1 = 0xcc9e2d51; + const uint32_t c2 = 0x1b873593; + const uint32_t * blocks = (const uint32_t *)(data + nblocks * 4); + + for(int i = -nblocks; i; i++) { + uint32_t k1 = blocks[i]; + + k1 *= c1; + k1 = rotl32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = rotl32(h1,13); + h1 = h1 * 5 + 0xe6546b64; + } + + const uint8_t * tail = (const uint8_t*)(data + nblocks * 4); + uint32_t k1 = 0; + switch(len & 3) { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1; + }; + + h1 ^= len; + h1 = fmix32(h1); + return h1; + } + static const int MURMUR_R = 47; // Murmur2 hash implementation returning 64-bit hashes. diff --git a/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java b/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java index 86287ae276..5eb9d6dec8 100644 --- a/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java +++ b/fe/src/main/java/org/apache/doris/analysis/FunctionCallExpr.java @@ -443,8 +443,10 @@ public class FunctionCallExpr extends Expr { } } else if (getChild(0) instanceof FunctionCallExpr) { FunctionCallExpr functionCallExpr = (FunctionCallExpr) getChild(0); - if (!functionCallExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP)) { - throw new AnalysisException("BITMAP_UNION function only support TO_BITMAP function as it's child"); + String fnName = functionCallExpr.getFnName().getFunction(); + if (!FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) { + throw new AnalysisException("BITMAP_UNION function only support " + + "to_bitmap, bitmap_hash or bitmap_union function as it's child"); } } else { throw new AnalysisException("BITMAP_UNION only support BITMAP_UNION(column) or BITMAP_UNION(TO_BITMAP(column))"); diff --git a/fe/src/main/java/org/apache/doris/analysis/InsertStmt.java b/fe/src/main/java/org/apache/doris/analysis/InsertStmt.java index 9e0f79d8b3..20e639f202 100644 --- a/fe/src/main/java/org/apache/doris/analysis/InsertStmt.java +++ b/fe/src/main/java/org/apache/doris/analysis/InsertStmt.java @@ -643,8 +643,8 @@ public class InsertStmt extends DdlStmt { private void checkBitmapCompatibility(Column col, Expr expr) throws AnalysisException { boolean isCompatible = false; final String bitmapMismatchLog = "Column's type is BITMAP," - + " SelectList must contains BITMAP column, to_bitmap or bitmap_union" + - " or bitmap_empty function's result, column=" + col.getName(); + + " SelectList must contains BITMAP column, to_bitmap, bitmap_hash, bitmap_empty" + + " or bitmap_union function's result, column=" + col.getName(); if (expr instanceof SlotRef) { final SlotRef slot = (SlotRef) expr; Column column = slot.getDesc().getColumn(); @@ -662,9 +662,10 @@ public class InsertStmt extends DdlStmt { } else if (expr instanceof FunctionCallExpr) { final FunctionCallExpr functionExpr = (FunctionCallExpr) expr; // select id, to_bitmap(id2) from table - // select id, bitmap_empty from table - if (functionExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP) - || functionExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_EMPTY)) { + // select id, bitmap_hash(id) from table + // select id, bitmap_empty() from table + String fnName = functionExpr.getFnName().getFunction(); + if (FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) { isCompatible = true; } } diff --git a/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java b/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java index 648293422d..5b94c72451 100644 --- a/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java +++ b/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java @@ -18,6 +18,7 @@ package org.apache.doris.catalog; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSortedSet; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.apache.doris.analysis.ArithmeticExpr; @@ -35,6 +36,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Set; public class FunctionSet { private static final Logger LOG = LogManager.getLogger(FunctionSet.class); @@ -514,9 +516,9 @@ public class FunctionSet { public static final String BITMAP_UNION_COUNT = "bitmap_union_count"; public static final String BITMAP_UNION_INT = "bitmap_union_int"; public static final String BITMAP_COUNT = "bitmap_count"; - public static final String BITMAP_EMPTY = "bitmap_empty"; - public static final String TO_BITMAP = "to_bitmap"; + public static final Set BITMAP_LOAD_FNS = new ImmutableSortedSet.Builder(String.CASE_INSENSITIVE_ORDER) + .add("to_bitmap", "bitmap_hash", "bitmap_empty").build(); private static final Map BITMAP_UNION_INT_SYMBOL = ImmutableMap.builder() diff --git a/fe/src/main/java/org/apache/doris/planner/LoadScanNode.java b/fe/src/main/java/org/apache/doris/planner/LoadScanNode.java index 4caec0fa59..b2dcc89bcd 100644 --- a/fe/src/main/java/org/apache/doris/planner/LoadScanNode.java +++ b/fe/src/main/java/org/apache/doris/planner/LoadScanNode.java @@ -83,16 +83,14 @@ public abstract class LoadScanNode extends ScanNode { isCompatible = false; } else { FunctionCallExpr fn = (FunctionCallExpr) expr; - if (!fn.getFnName().getFunction().equalsIgnoreCase(FunctionSet.TO_BITMAP) - && !fn.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_EMPTY)) { + String fnName = fn.getFnName().getFunction(); + if (!FunctionSet.BITMAP_LOAD_FNS.contains(fnName)) { isCompatible = false; } } } if (!isCompatible) { - throw new AnalysisException("bitmap column must use to_bitmap or empty_bitmap function, like " - + slotDesc.getColumn().getName() + "=to_bitmap(xxx)" - + slotDesc.getColumn().getName() + "=bitmap_empty()"); + throw new AnalysisException("bitmap column must use to_bitmap, bitmap_hash or empty_bitmap function"); } } diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index 9e262dadc0..fe6d0e6498 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -602,6 +602,8 @@ visible_functions = [ [['to_bitmap'], 'VARCHAR', ['VARCHAR'], '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE'], + [['bitmap_hash'], 'VARCHAR', ['VARCHAR'], + '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE'], [['bitmap_count'], 'BIGINT', ['VARCHAR'], '_ZN5doris15BitmapFunctions12bitmap_countEPN9doris_udf15FunctionContextERKNS1_9StringValE'], [['bitmap_empty'], 'VARCHAR', [],