From 3f99dd5c4b86fce67d2f62ac05a3f4a05cc579f6 Mon Sep 17 00:00:00 2001 From: TengJianPing <18241664+jacktengg@users.noreply.github.com> Date: Tue, 27 Sep 2022 12:16:02 +0800 Subject: [PATCH] [function](bitmap) support bitmap_hash64 (#12992) --- be/src/exprs/bitmap_function.cpp | 10 ++++ be/src/exprs/bitmap_function.h | 1 + be/src/vec/functions/function_bitmap.cpp | 44 ++++++++++++--- .../aggregate-functions/bitmap_union.md | 2 +- .../bitmap-functions/bitmap_hash64.md | 52 ++++++++++++++++++ .../sql-reference/Data-Types/BITMAP.md | 4 +- .../aggregate-functions/bitmap_union.md | 2 +- .../bitmap-functions/bitmap_hash64.md | 52 ++++++++++++++++++ .../sql-reference/Data-Types/BITMAP.md | 4 +- gensrc/script/doris_builtins_functions.py | 6 ++ .../datatype_p0/bitmap/test_bitmap_int.out | Bin 239 -> 315 bytes .../bitmap_functions/test_bitmap_function.out | 15 ++++- .../datatype_p0/bitmap/test_bitmap_int.groovy | 15 +++++ .../test_bitmap_function.groovy | 11 +++- 14 files changed, 198 insertions(+), 20 deletions(-) create mode 100644 docs/en/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md create mode 100644 docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp index 4fa1383eec..d3747ca0af 100644 --- a/be/src/exprs/bitmap_function.cpp +++ b/be/src/exprs/bitmap_function.cpp @@ -169,6 +169,16 @@ StringVal BitmapFunctions::bitmap_hash(doris_udf::FunctionContext* ctx, } return serialize(ctx, &bitmap); } +StringVal BitmapFunctions::bitmap_hash64(doris_udf::FunctionContext* ctx, + const doris_udf::StringVal& src) { + BitmapValue bitmap; + if (!src.is_null) { + uint64_t hash_value = 0; + murmur_hash3_x64_64(src.ptr, src.len, 0, &hash_value); + bitmap.add(hash_value); + } + return serialize(ctx, &bitmap); +} StringVal BitmapFunctions::bitmap_serialize(FunctionContext* ctx, const StringVal& src) { if (src.is_null) { diff --git a/be/src/exprs/bitmap_function.h 
b/be/src/exprs/bitmap_function.h index 8b3db6b33e..0ace3589c8 100644 --- a/be/src/exprs/bitmap_function.h +++ b/be/src/exprs/bitmap_function.h @@ -68,6 +68,7 @@ public: static StringVal bitmap_serialize(FunctionContext* ctx, const StringVal& src); static StringVal to_bitmap(FunctionContext* ctx, const StringVal& src); static StringVal bitmap_hash(FunctionContext* ctx, const StringVal& src); + static StringVal bitmap_hash64(FunctionContext* ctx, const StringVal& src); static StringVal bitmap_or(FunctionContext* ctx, const StringVal& src, const StringVal& dst); static StringVal bitmap_xor(FunctionContext* ctx, const StringVal& src, const StringVal& dst); static StringVal bitmap_and(FunctionContext* ctx, const StringVal& src, const StringVal& dst); diff --git a/be/src/vec/functions/function_bitmap.cpp b/be/src/vec/functions/function_bitmap.cpp index b10974d876..397d70aed5 100644 --- a/be/src/vec/functions/function_bitmap.cpp +++ b/be/src/vec/functions/function_bitmap.cpp @@ -140,8 +140,22 @@ public: } }; -struct BitmapHash { +template <int HashBits> +struct BitmapHashName {}; + +template <> +struct BitmapHashName<32> { static constexpr auto name = "bitmap_hash"; +}; + +template <> +struct BitmapHashName<64> { + static constexpr auto name = "bitmap_hash64"; +}; + +template <int HashBits> +struct BitmapHash { + static constexpr auto name = BitmapHashName<HashBits>::name; using ReturnType = DataTypeBitMap; @@ -154,9 +168,15 @@ struct BitmapHash { for (size_t i = 0; i < size; ++i) { const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); size_t str_size = offsets[i] - offsets[i - 1]; - uint32_t hash_value = - HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); - res_data[i].add(hash_value); + if constexpr (HashBits == 32) { + uint32_t hash_value = + HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); + res_data[i].add(hash_value); + } else { + uint64_t hash_value = 0; + murmur_hash3_x64_64(raw_str, str_size, 0, &hash_value); + res_data[i].add(hash_value); + } } } 
@@ -173,9 +193,15 @@ struct BitmapHash { } else { const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); size_t str_size = offsets[i] - offsets[i - 1]; - uint32_t hash_value = - HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); - res_data[i].add(hash_value); + if constexpr (HashBits == 32) { + uint32_t hash_value = + HashUtil::murmur_hash3_32(raw_str, str_size, HashUtil::MURMUR3_32_SEED); + res_data[i].add(hash_value); + } else { + uint64_t hash_value = 0; + murmur_hash3_x64_64(raw_str, str_size, 0, &hash_value); + res_data[i].add(hash_value); + } } } } @@ -511,7 +537,8 @@ public: using FunctionBitmapEmpty = FunctionConst; using FunctionToBitmap = FunctionAlwaysNotNullable; using FunctionBitmapFromString = FunctionBitmapAlwaysNull; -using FunctionBitmapHash = FunctionAlwaysNotNullable<BitmapHash>; +using FunctionBitmapHash = FunctionAlwaysNotNullable<BitmapHash<32>>; +using FunctionBitmapHash64 = FunctionAlwaysNotNullable<BitmapHash<64>>; using FunctionBitmapMin = FunctionBitmapSingle; using FunctionBitmapMax = FunctionBitmapSingle; @@ -539,6 +566,7 @@ void register_function_bitmap(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function<FunctionBitmapHash64>(); factory.register_function(); factory.register_function(); factory.register_function(); diff --git a/docs/en/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md b/docs/en/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md index f5f0f89049..8afe22c5be 100644 --- a/docs/en/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md +++ b/docs/en/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md @@ -57,7 +57,7 @@ ALTER TABLE pv_bitmap ADD ROLLUP pv (page, user_id); `BITMAP_EMPTY ()`: Generate empty bitmap columns, used for insert or import to fill the default value -`BITMAP_HASH (expr)`: Convert any type of column to a bitmap by hashing +`BITMAP_HASH (expr)` or `BITMAP_HASH64 (expr)`: Convert any type 
of column to a bitmap by hashing ##### Stream Load diff --git a/docs/en/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md b/docs/en/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md new file mode 100644 index 0000000000..e633df9b94 --- /dev/null +++ b/docs/en/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md @@ -0,0 +1,52 @@ +--- +{ + "title": "bitmap_hash64", + "language": "en" +} +--- + + + +## bitmap_hash64 +### description +#### Syntax + +`BITMAP BITMAP_HASH64(expr)` + +Computes the 64-bit hash value of an expression of any type and returns a bitmap containing that hash value. It is mainly used to load non-integer values into a bitmap column, e.g., + +``` +cat data | curl --location-trusted -u user:passwd -T - -H "columns: dt,page,device_id, device_id=bitmap_hash64(device_id)" http://host:8410/api/test/testDb/_stream_load +``` + +### example + +``` +mysql> select bitmap_count(bitmap_hash64('hello')); ++------------------------------------+ +| bitmap_count(bitmap_hash64('hello')) | ++------------------------------------+ +| 1 | ++------------------------------------+ +``` + +### keywords + + BITMAP_HASH,BITMAP diff --git a/docs/en/docs/sql-manual/sql-reference/Data-Types/BITMAP.md b/docs/en/docs/sql-manual/sql-reference/Data-Types/BITMAP.md index 24ae635e6b..c66b29424a 100644 --- a/docs/en/docs/sql-manual/sql-reference/Data-Types/BITMAP.md +++ b/docs/en/docs/sql-manual/sql-reference/Data-Types/BITMAP.md @@ -30,10 +30,10 @@ BITMAP BITMAP cannot be used as a key column, and the aggregation type is BITMAP_UNION when building the table. The user does not need to specify the length and default value. The length is controlled within the system according to the degree of data aggregation. -And the BITMAP column can only be queried or used by supporting functions such as bitmap_union_count, bitmap_union, and bitmap_hash. 
+And the BITMAP column can only be queried or used by supporting functions such as bitmap_union_count, bitmap_union, bitmap_hash and bitmap_hash64. The use of BITMAP in offline scenarios will affect the import speed. In the case of a large amount of data, the query speed will be slower than HLL and better than Count Distinct. -Note: If BITMAP does not use a global dictionary in real-time scenarios, using bitmap_hash() may cause an error of about one-thousandth. +Note: If BITMAP does not use a global dictionary in real-time scenarios, using bitmap_hash() may cause an error of about one-thousandth. If the error rate is not tolerable, bitmap_hash64 can be used instead. ### example diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md b/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md index f4742b8d60..81b8c0c9be 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/aggregate-functions/bitmap_union.md @@ -56,7 +56,7 @@ ALTER TABLE pv_bitmap ADD ROLLUP pv (page, user_id); `BITMAP_EMPTY()`: 生成空 bitmap 列,用于 insert 或导入的时填充默认值 -`BITMAP_HASH(expr)`: 将任意类型的列通过 Hash 的方式转为 bitmap +`BITMAP_HASH(expr)`或者`BITMAP_HASH64(expr)`: 将任意类型的列通过 Hash 的方式转为 bitmap ##### Stream Load diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md b/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md new file mode 100644 index 0000000000..38c6bf22b7 --- /dev/null +++ b/docs/zh-CN/docs/sql-manual/sql-functions/bitmap-functions/bitmap_hash64.md @@ -0,0 +1,52 @@ +--- +{ + "title": "bitmap_hash64", + "language": "zh-CN" +} +--- + + + +## bitmap_hash64 +### description +#### Syntax + +`BITMAP BITMAP_HASH64(expr)` + +对任意类型的输入计算64位的哈希值,返回包含该哈希值的bitmap。主要用于stream load任务将非整型字段导入Doris表的bitmap字段。例如 + +``` +cat data | curl --location-trusted -u user:passwd -T - -H "columns: dt,page,device_id, 
device_id=bitmap_hash64(device_id)" http://host:8410/api/test/testDb/_stream_load +``` + +### example + +``` +mysql> select bitmap_count(bitmap_hash64('hello')); ++------------------------------------+ +| bitmap_count(bitmap_hash64('hello')) | ++------------------------------------+ +| 1 | ++------------------------------------+ +``` + +### keywords + + BITMAP_HASH,BITMAP diff --git a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Types/BITMAP.md b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Types/BITMAP.md index acee6ca688..f469615ab1 100644 --- a/docs/zh-CN/docs/sql-manual/sql-reference/Data-Types/BITMAP.md +++ b/docs/zh-CN/docs/sql-manual/sql-reference/Data-Types/BITMAP.md @@ -29,10 +29,10 @@ under the License. BITMAP BITMAP不能作为key列使用,建表时配合聚合类型为BITMAP_UNION。 用户不需要指定长度和默认值。长度根据数据的聚合程度系统内控制。 - 并且BITMAP列只能通过配套的bitmap_union_count、bitmap_union、bitmap_hash等函数进行查询或使用。 + 并且BITMAP列只能通过配套的bitmap_union_count、bitmap_union、bitmap_hash、bitmap_hash64等函数进行查询或使用。 离线场景下使用BITMAP会影响导入速度,在数据量大的情况下查询速度会慢于HLL,并优于Count Distinct。 - 注意:实时场景下BITMAP如果不使用全局字典,使用了bitmap_hash()可能会导致有千分之一左右的误差。 + 注意:实时场景下BITMAP如果不使用全局字典,使用了bitmap_hash()可能会导致有千分之一左右的误差。如果这个误差不可接受,可以使用bitmap_hash64。 ### example diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py index c15a15915c..61b612af08 100755 --- a/gensrc/script/doris_builtins_functions.py +++ b/gensrc/script/doris_builtins_functions.py @@ -2269,10 +2269,16 @@ visible_functions = [ [['bitmap_hash'], 'BITMAP', ['VARCHAR'], '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], + [['bitmap_hash64'], 'BITMAP', ['VARCHAR'], + '_ZN5doris15BitmapFunctions13bitmap_hash64EPN9doris_udf15FunctionContextERKNS1_9StringValE', + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['to_bitmap'], 'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions9to_bitmapEPN9doris_udf15FunctionContextERKNS1_9StringValE', '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['bitmap_hash'], 
'BITMAP', ['STRING'], '_ZN5doris15BitmapFunctions11bitmap_hashEPN9doris_udf15FunctionContextERKNS1_9StringValE', '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], + [['bitmap_hash64'], 'BITMAP', ['STRING'], + '_ZN5doris15BitmapFunctions13bitmap_hash64EPN9doris_udf15FunctionContextERKNS1_9StringValE', + '', '', 'vec', 'ALWAYS_NOT_NULLABLE'], [['bitmap_count'], 'BIGINT', ['BITMAP'], diff --git a/regression-test/data/datatype_p0/bitmap/test_bitmap_int.out b/regression-test/data/datatype_p0/bitmap/test_bitmap_int.out index a814d07eaf66c2ceefd8410f6409593570300480..a8066c8edaf229cfcdc7746ed6e38f5af2ef91b4 100644 GIT binary patch delta 83 zcmaFQxSMIhdmmk01;ygR95a)6Lj_%3E>kWpgoqJH#E{d7%ZSs6%b3#;S