[Feature](add bitmap udaf) add the bitmap intersection and difference set for mixed calculation of udaf (#15588)

* Add the bitmap intersection and difference set for mixed calculation of udaf

Co-authored-by: zhangbinbin05 <zhangbinbin05@baidu.com>
This commit is contained in:
zhbinbin
2023-03-14 20:40:37 +08:00
committed by GitHub
parent 65f71d9e06
commit ff9e03e2bf
14 changed files with 655 additions and 6 deletions

View File

@ -0,0 +1,216 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <stack>
#include <string>
#include "util/bitmap_intersect.h"
namespace doris {
// Compute the intersection union difference set of two or more bitmaps
// Usage: orthogonal_bitmap_parse_calculate(bitmap_column, filter_column, input_string)
// Example: orthogonal_bitmap_expr_calculate(user_id, event, '(A|B)&(C-D)'), meaning find the intersection union difference set of user_id in all A/B/C/D 4 bitmaps
// Operation symbol:
// the operator '|' stands for union, the operator '&' stands for intersection, the operator '-' indicates the difference set, the operator '^' stands for xor
class BitmapExprCalculation : public BitmapIntersect<std::string> {
public:
BitmapExprCalculation() = default;
explicit BitmapExprCalculation(const char* src) { deserialize(src); }
void bitmap_calculation_init(std::string& input_str) {
_polish = reverse_polish(input_str);
std::string bitmap_key;
for (int i = 0; i < _polish.length(); i++) {
char c = _polish.at(i);
if (c != '&' && c != '|' && c != '^' && c != '-' && c != ' ' && c != '\\') {
bitmap_key += c;
} else if (i != 0 && _polish.at(i - 1) == '\\') {
bitmap_key += c;
} else if (c == '\\') {
continue;
} else {
if (bitmap_key.length() > 0) {
add_key(bitmap_key);
bitmap_key.clear();
}
}
}
if (bitmap_key.length() > 0) {
add_key(bitmap_key);
bitmap_key.clear();
}
}
BitmapValue bitmap_calculate() {
std::stack<BitmapValue> values;
std::string bitmap_key;
for (int i = 0; i < _polish.length(); i++) {
char c = _polish.at(i);
if (c == ' ') {
if (bitmap_key.length() > 0) {
values.push(_bitmaps[bitmap_key]);
bitmap_key.clear();
}
} else if (c != '&' && c != '|' && c != '^' && c != '-' && c != '\\') {
bitmap_key += c;
} else if (i != 0 && _polish.at(i - 1) == '\\') {
bitmap_key += c;
} else if (c == '\\') {
continue;
} else {
if (bitmap_key.length() > 0) {
values.push(_bitmaps[bitmap_key]);
bitmap_key.clear();
}
if (values.size() >= 2) {
BitmapValue op_a = values.top();
values.pop();
BitmapValue op_b = values.top();
values.pop();
BitmapValue cal_result;
bitmap_calculate(op_a, op_b, c, cal_result);
values.push(cal_result);
}
}
}
BitmapValue result;
if (bitmap_key.length() > 0) {
result |= _bitmaps[bitmap_key];
} else if (!values.empty()) {
result |= values.top();
}
return result;
}
// calculate the bitmap value by expr bitmap calculate
int64_t bitmap_calculate_count() {
if (_bitmaps.empty()) {
return 0;
}
return bitmap_calculate().cardinality();
}
private:
constexpr int priority(char c) {
switch (c) {
case '&':
return 1;
case '|':
return 1;
case '^':
return 1;
case '-':
return 1;
default:
return 0;
}
}
template <class T>
std::string print_stack(std::stack<T>& stack) {
std::string result;
while (!stack.empty()) {
result = stack.top() + result;
stack.pop();
}
return result;
}
std::string reverse_polish(const std::string& input_str) {
std::stack<char> polish;
std::stack<char> op_stack;
bool last_is_char = false;
for (int i = 0; i < input_str.length(); i++) {
char cur_char = input_str.at(i);
if (cur_char != '&' && cur_char != '|' && cur_char != '^' && cur_char != '-' &&
cur_char != '(' && cur_char != ')' && cur_char != ' ' && cur_char != '\t') {
if (!last_is_char) {
polish.push(' ');
}
polish.push(cur_char);
last_is_char = true;
continue;
} else if (i != 0 && input_str.at(i - 1) == '\\') {
polish.push(cur_char);
last_is_char = true;
continue;
} else if (cur_char == ' ' || cur_char == '\t') {
last_is_char = false;
continue;
} else if (cur_char == '(') {
op_stack.push(cur_char);
} else if (!op_stack.empty() && cur_char == ')') {
while (!op_stack.empty() && op_stack.top() != '(') {
polish.push(op_stack.top());
op_stack.pop();
}
op_stack.pop();
} else {
if (!op_stack.empty() && op_stack.top() == '(') {
op_stack.push(cur_char);
} else {
if (!op_stack.empty() && priority(cur_char) > priority(op_stack.top())) {
op_stack.push(cur_char);
} else {
while (!op_stack.empty()) {
if (op_stack.top() == '(') {
break;
}
if (priority(cur_char) <= priority(op_stack.top())) {
polish.push(op_stack.top());
op_stack.pop();
} else {
break;
}
}
op_stack.push(cur_char);
}
}
}
last_is_char = false;
}
while (!op_stack.empty()) {
polish.push(op_stack.top());
op_stack.pop();
}
return print_stack(polish);
}
void bitmap_calculate(BitmapValue& op_a, BitmapValue& op_b, char op, BitmapValue& result) {
result |= op_b;
switch (op) {
case '&':
result &= op_a;
break;
case '|':
result |= op_a;
break;
case '-':
result -= op_a;
break;
case '^':
result ^= op_a;
break;
}
}
std::string _polish;
};
} // namespace doris

View File

@ -148,7 +148,6 @@ void Helper::read_from<std::string>(const char** src, std::string* result) {
*src += length;
}
// read_from end
} // namespace detail
// Calculate the intersection of two or more bitmaps
@ -238,7 +237,7 @@ public:
}
}
private:
protected:
std::map<T, BitmapValue> _bitmaps;
};
@ -325,7 +324,7 @@ public:
}
}
private:
protected:
phmap::flat_hash_map<std::string, BitmapValue> _bitmaps;
};

View File

@ -70,6 +70,18 @@ AggregateFunctionPtr create_aggregate_function_orthogonal_bitmap_intersect_count
result_is_nullable);
}
AggregateFunctionPtr create_aggregate_function_orthogonal_bitmap_expr_calculate(
const std::string& name, const DataTypes& argument_types, bool result_is_nullable) {
return create_aggregate_function_orthogonal<AggOrthBitMapExprCal>(name, argument_types,
result_is_nullable);
}
AggregateFunctionPtr create_aggregate_function_orthogonal_bitmap_expr_calculate_count(
const std::string& name, const DataTypes& argument_types, bool result_is_nullable) {
return create_aggregate_function_orthogonal<AggOrthBitMapExprCalCount>(name, argument_types,
result_is_nullable);
}
AggregateFunctionPtr create_aggregate_function_intersect_count(const std::string& name,
const DataTypes& argument_types,
@ -92,5 +104,11 @@ void register_aggregate_function_orthogonal_bitmap(AggregateFunctionSimpleFactor
factory.register_function_both("orthogonal_bitmap_union_count",
create_aggregate_function_orthogonal_bitmap_union_count);
factory.register_function_both("intersect_count", create_aggregate_function_intersect_count);
factory.register_function_both("orthogonal_bitmap_expr_calculate",
create_aggregate_function_orthogonal_bitmap_expr_calculate);
factory.register_function_both(
"orthogonal_bitmap_expr_calculate_count",
create_aggregate_function_orthogonal_bitmap_expr_calculate_count);
}
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -17,6 +17,7 @@
#pragma once
#include "util/bitmap_expr_calculation.h"
#include "util/bitmap_intersect.h"
#include "util/bitmap_value.h"
#include "vec/aggregate_functions/aggregate_function.h"
@ -177,6 +178,102 @@ private:
Int64 result = 0;
};
template <typename T>
struct AggOrthBitmapExprCalBaseData {
public:
using ColVecData = std::conditional_t<IsNumber<T>, ColumnVector<T>, ColumnString>;
void add(const IColumn** columns, size_t row_num) {
const auto& bitmap_col = static_cast<const ColumnBitmap&>(*columns[0]);
const auto& data_col = static_cast<const ColVecData&>(*columns[1]);
const auto& bitmap_value = bitmap_col.get_element(row_num);
std::string update_key = data_col.get_data_at(row_num).to_string();
bitmap_expr_cal.update(update_key, bitmap_value);
}
void init_add_key(const IColumn** columns, size_t row_num, int argument_size) {
if (first_init) {
DCHECK(argument_size > 1);
const auto& col = static_cast<const ColVecData&>(*columns[2]);
std::string expr = col.get_data_at(row_num).to_string();
bitmap_expr_cal.bitmap_calculation_init(expr);
first_init = false;
}
}
protected:
doris::BitmapExprCalculation bitmap_expr_cal;
bool first_init = true;
};
template <typename T>
struct AggOrthBitMapExprCal : public AggOrthBitmapExprCalBaseData<T> {
public:
static constexpr auto name = "orthogonal_bitmap_expr_calculate";
static DataTypePtr get_return_type() { return std::make_shared<DataTypeBitMap>(); }
void merge(const AggOrthBitMapExprCal& rhs) {
if (rhs.first_init) {
return;
}
result |= rhs.result;
}
void write(BufferWritable& buf) {
write_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf);
result = AggOrthBitmapExprCalBaseData<T>::bitmap_expr_cal.bitmap_calculate();
DataTypeBitMap::serialize_as_stream(result, buf);
}
void read(BufferReadable& buf) {
read_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf);
DataTypeBitMap::deserialize_as_stream(result, buf);
}
void get(IColumn& to) const {
auto& column = static_cast<ColumnBitmap&>(to);
column.get_data().emplace_back(result);
}
private:
BitmapValue result;
};
template <typename T>
struct AggOrthBitMapExprCalCount : public AggOrthBitmapExprCalBaseData<T> {
public:
static constexpr auto name = "orthogonal_bitmap_expr_calculate_count";
static DataTypePtr get_return_type() { return std::make_shared<DataTypeInt64>(); }
void merge(const AggOrthBitMapExprCalCount& rhs) {
if (rhs.first_init) {
return;
}
result += rhs.result;
}
void write(BufferWritable& buf) {
write_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf);
result = AggOrthBitmapExprCalBaseData<T>::bitmap_expr_cal.bitmap_calculate_count();
write_binary(result, buf);
}
void read(BufferReadable& buf) {
read_binary(AggOrthBitmapExprCalBaseData<T>::first_init, buf);
read_binary(result, buf);
}
void get(IColumn& to) const {
auto& column = static_cast<ColumnVector<Int64>&>(to);
column.get_data().emplace_back(result);
}
private:
int64_t result = 0;
};
template <typename T>
struct OrthBitmapUnionCountData {
static constexpr auto name = "orthogonal_bitmap_union_count";
@ -247,4 +344,4 @@ public:
private:
int _argument_size;
};
} // namespace doris::vectorized
} // namespace doris::vectorized

View File

@ -141,6 +141,35 @@ Explain:
on the basis of this table schema, this function is divided into two layers. In the first layer, be nodes (update and serialize) merge all the bitmaps, and then count the resulting bitmaps. The count values are serialized and sent to the second level be nodes (merge and finalize). In the second layer, the be nodes are used to calculate the sum of all the count values from the first level nodes
#### orthogonal_bitmap_expr_calculate
Compute the function by computing the intersection, union and difference set of the expression bitmap.
Syntax:
orthogonal_bitmap_expr_calculate(bitmap_column, filter_column, input_string)
Parameters:
the first parameter is the Bitmap column, the second parameter is the dimension column used for filtering, that is, the calculated key column, and the third parameter is the calculation expression string, which means that the bitmap intersection and union difference expression is calculated according to the key column
the calculators supported by the expression: & represents intersection calculation, | represents union calculation, - represents difference calculation, ^ represents XOR calculation, and \ represents escape characters
Explain:
the aggregation of query planning is divided into two layers. The first layer of be aggregation node calculation includes init, update, and serialize steps. The second layer of be aggregation node calculation includes merge and finalize steps. In the first layer of be node, the input string is parsed in the init phase, which is converted into a suffix expression (inverse Polish), parses the calculated key value, and initializes it in the map<key, bitmap>structure; In the update phase, the underlying kernel scan dimension column (filter_column) calls back the update function, and then aggregates the bitmap in the map structure of the previous step in the unit of computing key; In the serialize stage, the bitmap of the key column is parsed according to the suffix expression, and the bitmap intersection, merge and difference set is calculated using the first in, last out principle of the stack structure. Then the final bitmap is serialized and sent to the aggregation be node in the second layer. Aggregates be nodes in the second layer, finds the union set of all bitmap values from the first layer nodes, and returns the final bitmap results
#### orthogonal_bitmap_expr_calculate_count
Compute the count function by computing the intersection, union and difference set of the expression bitmap. The syntax and parameters is the same as orthogonal_bitmap_expr_calculate
Syntax:
orthogonal_bitmap_expr_calculate_count(bitmap_column, filter_column, input_string)
Explain:
the aggregation of query planning is divided into two layers. The first layer of be aggregation node calculation includes init, update, and serialize steps. The second layer of be aggregation node calculation includes merge and finalize steps. In the first layer of be node, the input string is parsed in the init phase, converted to suffix expression Formula (inverse Polish formula), parse the calculated key value and initialize it in the map<key, bitmap>structure; In the update phase, the underlying kernel scan dimension column (filter_column) calls back the update function, and then aggregates the bitmap in the map structure of the previous step in the unit of computing key; In the serialize stage, the bitmap of the key column is parsed according to the suffix expression, and the bitmap intersection, merge and difference set is calculated using the first in, last out principle of the stack structure. Then the count value of the final bitmap is serialized and sent to the aggregation be node in the second layer.> Aggregates be nodes in the second layer, adds and sums all count values from the first layer nodes, and returns the final count result.
### Suitable for the scene
@ -159,3 +188,15 @@ Calculate the deduplication value for user_id:
```
select orthogonal_bitmap_union_count(user_id) from user_tag_bitmap where tag in (13080800, 11110200);
```
Bitmap cross merge difference set hybrid computing:
```
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000);
Note: 1000, 20000, 30000 plastic tags represent different labels of users
```
```
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23');
Note: 'A:a/b', 'B:2-4', etc. are string types tag, representing different labels of users, where 'B:2-4' needs to be escaped as'B:2\\-4'
```

View File

@ -0,0 +1,47 @@
---
{
"title": "orthogonal_bitmap_expr_calculate",
"language": "en"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## orthogonal_bitmap_expr_calculate
### description
#### Syntax
`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE(bitmap_column, column_to_filter, input_string)`
The first parameter is the Bitmap column, the second parameter is the dimension column used for filtering, that is, the calculated key column, and the third parameter is the calculation expression string, meaning that the bitmap intersection, union, and difference set expression is calculated according to the key column
The calculators supported by the expression:&represents intersection calculation, | represents union calculation, - represents difference calculation, ^ represents XOR calculation, and \ represents escape characters
### example
```
select orthogonal_bitmap_expr_calculate(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000);
Note: 1000, 20000, 30000 plastic tags represent different labels of users
```
```
select orthogonal_bitmap_expr_calculate(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23');
Note: 'A:a/b', 'B:2-4', etc. are string types tag, representing different labels of users, where 'B:2-4' needs to be escaped as'B:2\\-4'
```
### keywords
ORTHOGONAL_BITMAP_EXPR_CALCULATE,BITMAP

View File

@ -0,0 +1,47 @@
---
{
"title": "orthogonal_bitmap_expr_calculate_count",
"language": "en"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## orthogonal_bitmap_expr_calculate_count
### description
#### Syntax
`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT(bitmap_column, column_to_filter, input_string)`
Calculate the bitmap intersection, union and difference set of expressions to calculate the count function. The first parameter is the Bitmap column, the second parameter is the dimension column used for filtering, that is, the calculated key column, and the third parameter is the calculation expression string, meaning that the bitmap intersection, union and difference set expression is calculated according to the key column
The calculators supported by the expression:&represents intersection calculation, | represents union calculation, - represents difference calculation, ^ represents XOR calculation, and \ represents escape characters
### example
```
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000);
Note: 1000, 20000, 30000 plastic tags represent different labels of users
```
```
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23');
Note: 'A:a/b', 'B:2-4', etc. are string types tag, representing different labels of users, where 'B:2-4' needs to be escaped as'B:2\\-4'
```
### keywords
ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT,BITMAP

View File

@ -545,6 +545,8 @@
"sql-manual/sql-functions/bitmap-functions/bitmap_intersect",
"sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_intersect",
"sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_intersect_count",
"sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate",
"sql-manual/sql-functions/bitmap-functions/orthogonal_bitmap_expr_calculate_count",
"sql-manual/sql-functions/bitmap-functions/bitmap_hash64"
]
},

View File

@ -145,6 +145,36 @@ orthogonal_bitmap_union_count(bitmap_column)
查询规划上分2层,在第一层be节点(update、serialize)对所有bitmap求并集,再对并集的结果bitmap求count,count值序列化后发送至第二层be节点(merge、finalize),在第二层be节点对所有来源于第一层节点的count值循环求sum
#### orthogonal_bitmap_expr_calculate
求表达式bitmap交并差集合计算函数。
语法:
orthogonal_bitmap_expr_calculate(bitmap_column, filter_column, input_string)
参数:
第一个参数是Bitmap列,第二个参数是用来过滤的维度列,即计算的key列,第三个参数是计算表达式字符串,含义是依据key列进行bitmap交并差集表达式计算
表达式支持的计算符:& 代表交集计算,| 代表并集计算,- 代表差集计算, ^ 代表异或计算,\ 代表转义字符
说明:
查询规划上聚合分2层,第一层be聚合节点计算包括init、update、serialize步骤,第二层be聚合节点计算包括merge、finalize步骤。在第一层be节点,init阶段解析input_string字符串,转换为后缀表达式(逆波兰式),解析出计算key值,并在map<key, bitmap>结构中初始化;update阶段,底层内核scan维度列(filter_column)数据后回调update函数,然后以计算key为单位对上一步的map结构中的bitmap进行聚合;serialize阶段,根据后缀表达式,解析出计算key列的bitmap,利用栈结构先进后出原则,进行bitmap交并差集合计算,然后对最终的结果bitmap序列化后发送至第二层聚合be节点。在第二层聚合be节点,对所有来源于第一层节点的bitmap值求并集,并返回最终bitmap结果
#### orthogonal_bitmap_expr_calculate_count
求表达式bitmap交并差集合计算count函数, 语法和参数同orthogonal_bitmap_expr_calculate。
语法:
orthogonal_bitmap_expr_calculate_count(bitmap_column, filter_column, input_string)
说明:
查询规划上聚合分2层,第一层be聚合节点计算包括init、update、serialize步骤,第二层be聚合节点计算包括merge、finalize步骤。在第一层be节点,init阶段解析input_string字符串,转换为后缀表达式(逆波兰式),解析出计算key值,并在map<key, bitmap>结构中初始化;update阶段,底层内核scan维度列(filter_column)数据后回调update函数,然后以计算key为单位对上一步的map结构中的bitmap进行聚合;serialize阶段,根据后缀表达式,解析出计算key列的bitmap,利用栈结构先进后出原则,进行bitmap交并差集合计算,然后对最终的结果bitmap的count值序列化后发送至第二层聚合be节点。在第二层聚合be节点,对所有来源于第一层节点的count值求加和,并返回最终count结果。
### 使用场景
符合对bitmap进行正交计算的场景,如在用户行为分析中,计算留存,漏斗,用户画像等。
@ -161,3 +191,15 @@ orthogonal_bitmap_union_count(bitmap_column)
```sql
select orthogonal_bitmap_union_count(user_id) from user_tag_bitmap where tag in (13080800, 11110200);
```
bitmap交并差集合混合计算:
```sql
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000);
注:10002000030000等整形tag,代表用户不同标签
```
```sql
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23');
注:'A:a/b', 'B:2-4'等是字符串类型tag,代表用户不同标签, 其中'B:2-4'需要转义成'B:2\\-4'
```

View File

@ -0,0 +1,47 @@
---
{
"title": "orthogonal_bitmap_expr_calculate",
"language": "zh-CN"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## orthogonal_bitmap_expr_calculate
### description
#### Syntax
`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE(bitmap_column, column_to_filter, input_string)`
求表达式bitmap交并差集合计算函数, 第一个参数是Bitmap列,第二个参数是用来过滤的维度列,即计算的key列,第三个参数是计算表达式字符串,含义是依据key列进行bitmap交并差集表达式计算
表达式支持的计算符:& 代表交集计算,| 代表并集计算,- 代表差集计算, ^ 代表异或计算,\ 代表转义字符
### example
```sql
select orthogonal_bitmap_expr_calculate(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000);
注:10002000030000等整形tag,代表用户不同标签
```
```sql
select orthogonal_bitmap_expr_calculate(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23');
注:'A:a/b', 'B:2-4'等是字符串类型tag,代表用户不同标签, 其中'B:2-4'需要转义成'B:2\\-4'
```
### keywords
ORTHOGONAL_BITMAP_EXPR_CALCULATE,BITMAP

View File

@ -0,0 +1,47 @@
---
{
"title": "orthogonal_bitmap_expr_calculate_count",
"language": "zh-CN"
}
---
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
## orthogonal_bitmap_expr_calculate_count
### description
#### Syntax
`BITMAP ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT(bitmap_column, column_to_filter, input_string)`
求表达式bitmap交并差集合计算count函数, 第一个参数是Bitmap列,第二个参数是用来过滤的维度列,即计算的key列,第三个参数是计算表达式字符串,含义是依据key列进行bitmap交并差集表达式计算
表达式支持的计算符:& 代表交集计算,| 代表并集计算,- 代表差集计算, ^ 代表异或计算,\ 代表转义字符
### example
```sql
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(833736|999777)&(1308083|231207)&(1000|20000-30000)') from user_tag_bitmap where tag in (833736,999777,130808,231207,1000,20000,30000);
注:10002000030000等整形tag,代表用户不同标签
```
```sql
select orthogonal_bitmap_expr_calculate_count(user_id, tag, '(A:a/b|B:2\\-4)&(C:1-D:12)&E:23') from user_str_tag_bitmap where tag in ('A:a/b', 'B:2-4', 'C:1', 'D:12', 'E:23');
注:'A:a/b', 'B:2-4'等是字符串类型tag,代表用户不同标签, 其中'B:2-4'需要转义成'B:2\\-4'
```
### keywords
ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT,BITMAP

View File

@ -819,7 +819,9 @@ public class FunctionCallExpr extends Expr {
if (fnName.getFunction().equalsIgnoreCase(FunctionSet.INTERSECT_COUNT) || fnName.getFunction()
.equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_INTERSECT) || fnName.getFunction()
.equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_INTERSECT_COUNT)) {
.equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_INTERSECT_COUNT) || fnName.getFunction()
.equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT) || fnName.getFunction()
.equalsIgnoreCase(FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE)) {
if (children.size() <= 2) {
throw new AnalysisException(fnName + "(bitmap_column, column_to_filter, filter_values) "
+ "function requires at least three parameters");

View File

@ -50,6 +50,7 @@ public class AggregateFunction extends Function {
"dense_rank", "multi_distinct_count", "multi_distinct_sum", FunctionSet.HLL_UNION_AGG,
FunctionSet.HLL_UNION, FunctionSet.BITMAP_UNION, FunctionSet.BITMAP_INTERSECT,
FunctionSet.ORTHOGONAL_BITMAP_INTERSECT, FunctionSet.ORTHOGONAL_BITMAP_INTERSECT_COUNT,
FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT, FunctionSet.ORTHOGONAL_BITMAP_EXPR_CALCULATE,
FunctionSet.INTERSECT_COUNT, FunctionSet.ORTHOGONAL_BITMAP_UNION_COUNT,
FunctionSet.COUNT, "approx_count_distinct", "ndv", FunctionSet.BITMAP_UNION_INT,
FunctionSet.BITMAP_UNION_COUNT, "ndv_no_finalize", FunctionSet.WINDOW_FUNNEL, FunctionSet.RETENTION,

View File

@ -930,6 +930,8 @@ public class FunctionSet<T> {
public static final String ORTHOGONAL_BITMAP_UNION_COUNT = "orthogonal_bitmap_union_count";
public static final String APPROX_COUNT_DISTINCT = "approx_count_distinct";
public static final String NDV = "ndv";
public static final String ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT = "orthogonal_bitmap_expr_calculate_count";
public static final String ORTHOGONAL_BITMAP_EXPR_CALCULATE = "orthogonal_bitmap_expr_calculate";
public static final String QUANTILE_UNION = "quantile_union";
//TODO(weixiang): is quantile_percent can be replaced by approx_percentile?
@ -2461,6 +2463,47 @@ public class FunctionSet<T> {
Lists.newArrayList(Type.BITMAP, t, t), Type.BIGINT, Type.BITMAP, true, "", "", "", "", "", "", "",
true, false, true, true));
}
Type[] ntypes = {Type.CHAR, Type.VARCHAR, Type.STRING};
for (Type t : ntypes) {
addBuiltin(AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE,
Lists.newArrayList(Type.BITMAP, t, Type.STRING),
Type.BITMAP,
Type.VARCHAR,
true,
"_ZN5doris15BitmapFunctions37orthogonal_bitmap_expr_calculate_initEPN9doris_udf15FunctionContextEPNS1_9StringValE",
"_ZN5doris15BitmapFunctions39orthogonal_bitmap_expr_calculate_updateEPN9doris_udf15FunctionContextERKNS1_9StringValES6_iPS5_S7_",
"_ZN5doris15BitmapFunctions12bitmap_unionEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_",
"_ZN5doris15BitmapFunctions42orthogonal_bitmap_expr_calculate_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
"",
"",
"_ZN5doris15BitmapFunctions16bitmap_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
true, false, true));
addBuiltin(AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT,
Lists.newArrayList(Type.BITMAP, t, Type.STRING),
Type.BIGINT,
Type.VARCHAR,
true,
"_ZN5doris15BitmapFunctions43orthogonal_bitmap_expr_calculate_count_initEPN9doris_udf15FunctionContextEPNS1_9StringValE",
"_ZN5doris15BitmapFunctions39orthogonal_bitmap_expr_calculate_updateEPN9doris_udf15FunctionContextERKNS1_9StringValES6_iPS5_S7_",
"_ZN5doris15BitmapFunctions29orthogonal_bitmap_count_mergeEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_",
"_ZN5doris15BitmapFunctions48orthogonal_bitmap_expr_calculate_count_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
"",
"",
"_ZN5doris15BitmapFunctions32orthogonal_bitmap_count_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
true, false, true));
//vec ORTHOGONAL_BITMAP_EXPR_CALCULATE and ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT
addBuiltin(
AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE, Lists.newArrayList(Type.BITMAP, t, Type.STRING),
Type.BITMAP, Type.BITMAP, true, "", "", "", "", "", "", "", true, false, true, true));
addBuiltin(AggregateFunction.createBuiltin(ORTHOGONAL_BITMAP_EXPR_CALCULATE_COUNT,
Lists.newArrayList(Type.BITMAP, t, Type.STRING), Type.BIGINT, Type.BITMAP, true, "", "", "", "", "", "", "",
true, false, true, true));
}
// bitmap
addBuiltin(AggregateFunction.createBuiltin(BITMAP_UNION, Lists.newArrayList(Type.BITMAP),
Type.BITMAP,