[SQL][Function] Add approx_count_distinct() function (#4221)
Add approx_count_distinct() function to replace the ndv() function
This commit is contained in:
@ -267,7 +267,7 @@ module.exports = [
|
||||
"hll_union_agg",
|
||||
"max",
|
||||
"min",
|
||||
"ndv",
|
||||
"approx_count_distinct",
|
||||
"percentile_approx",
|
||||
"stddev",
|
||||
"stddev_samp",
|
||||
|
||||
@ -281,7 +281,7 @@ module.exports = [
|
||||
"hll_union_agg",
|
||||
"max",
|
||||
"min",
|
||||
"ndv",
|
||||
"approx_count_distinct",
|
||||
"percentile_approx",
|
||||
"stddev",
|
||||
"stddev_samp",
|
||||
|
||||
@ -226,7 +226,7 @@ Of course, the function of aggregated data is indispensable for general polymer
|
||||
|
||||
The following are some types of aggregated queries that can hit Rollup.
|
||||
|
||||
| Column type Query type | Sum | Distinct/Count Distinct | Min | Max | Ndv |
|
||||
| Column type Query type | Sum | Distinct/Count Distinct | Min | Max | APPROX_COUNT_DISTINCT |
|
||||
|--------------|-------|-------------------------|-------|-------|-------|
|
||||
| Key | false | true | true | true | true |
|
||||
| Value(Sum) | true | false | false | false | false |
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
---
|
||||
{
|
||||
"title": "NDV",
|
||||
"title": "APPROX_COUNT_DISTINCT",
|
||||
"language": "en"
|
||||
}
|
||||
---
|
||||
@ -24,11 +24,11 @@ specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
# NDV
|
||||
# APPROX_COUNT_DISTINCT
|
||||
## Description
|
||||
### Syntax
|
||||
|
||||
`NDV (expr)`
|
||||
`APPROX_COUNT_DISTINCT (expr)`
|
||||
|
||||
|
||||
An approximate aggregate function that returns a result similar to that of COUNT(DISTINCT col).
|
||||
@ -37,12 +37,12 @@ It combines COUNT and DISTINCT faster and uses fixed-size memory, so less memory
|
||||
|
||||
## example
|
||||
```
|
||||
MySQL > select ndv(query_id) from log_statis group by datetime;
|
||||
MySQL > select approx_count_distinct(query_id) from log_statis group by datetime;
|
||||
+-----------------+
|
||||
| ndv(`query_id`) |
|
||||
| approx_count_distinct(`query_id`) |
|
||||
+-----------------+
|
||||
| 17721 |
|
||||
+-----------------+
|
||||
```
|
||||
## keyword
|
||||
NDV
|
||||
APPROX_COUNT_DISTINCT
|
||||
@ -72,7 +72,7 @@ distributed by hash(id) buckets 32;
|
||||
curl --location-trusted -uname:password -T data -H "label:load_1" -H "columns:dt, id, name, province, sex, cuid, os, set1=hll_hash(cuid), set2=hll_hash(os)"
|
||||
http://host/api/test_db/test/_stream_load
|
||||
|
||||
3. There are three common ways of aggregating data: (without aggregating the base table directly, the speed may be similar to that of using NDV directly)
|
||||
3. There are three common ways of aggregating data (if you query the base table directly without aggregating, the speed may be about the same as using APPROX_COUNT_DISTINCT directly):
|
||||
|
||||
A. Create a rollup that allows HLL columns to generate aggregation.
|
||||
alter table test add rollup test_rollup(dt, set1);
|
||||
|
||||
@ -226,7 +226,7 @@ rollup_index4(k4, k6, k5, k1, k2, k3, k7)
|
||||
|
||||
以下是可以命中Rollup的一些聚合查询的种类,
|
||||
|
||||
| 列类型 查询类型 | Sum | Distinct/Count Distinct | Min | Max | Ndv |
|
||||
| 列类型 查询类型 | Sum | Distinct/Count Distinct | Min | Max | APPROX_COUNT_DISTINCT |
|
||||
|--------------|-------|-------------------------|-------|-------|-------|
|
||||
| Key | false | true | true | true | true |
|
||||
| Value(Sum) | true | false | false | false | false |
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
---
|
||||
{
|
||||
"title": "NDV",
|
||||
"title": "APPROX_COUNT_DISTINCT",
|
||||
"language": "zh-CN"
|
||||
}
|
||||
---
|
||||
@ -24,11 +24,11 @@ specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
# NDV
|
||||
# APPROX_COUNT_DISTINCT
|
||||
## description
|
||||
### Syntax
|
||||
|
||||
`NDV(expr)`
|
||||
`APPROX_COUNT_DISTINCT(expr)`
|
||||
|
||||
|
||||
返回类似于 COUNT(DISTINCT col) 结果的近似值聚合函数。
|
||||
@ -37,12 +37,12 @@ under the License.
|
||||
|
||||
## example
|
||||
```
|
||||
MySQL > select ndv(query_id) from log_statis group by datetime;
|
||||
MySQL > select approx_count_distinct(query_id) from log_statis group by datetime;
|
||||
+-----------------+
|
||||
| ndv(`query_id`) |
|
||||
| approx_count_distinct(`query_id`) |
|
||||
+-----------------+
|
||||
| 17721 |
|
||||
+-----------------+
|
||||
```
|
||||
## keyword
|
||||
NDV
|
||||
APPROX_COUNT_DISTINCT
|
||||
@ -69,7 +69,7 @@ under the License.
|
||||
curl --location-trusted -uname:password -T data -H "label:load_1" -H "columns:dt, id, name, province, sex, cuid, os, set1=hll_hash(cuid), set2=hll_hash(os)"
|
||||
http://host/api/test_db/test/_stream_load
|
||||
|
||||
3. 聚合数据,常用方式3种:(如果不聚合直接对base表查询,速度可能跟直接使用ndv速度差不多)
|
||||
3. 聚合数据,常用方式3种:(如果不聚合直接对base表查询,速度可能跟直接使用approx_count_distinct速度差不多)
|
||||
|
||||
a. 创建一个rollup,让hll列产生聚合,
|
||||
alter table test add rollup test_rollup(dt, set1);
|
||||
|
||||
@ -997,6 +997,17 @@ public class FunctionSet {
|
||||
"_ZN5doris12HllFunctions12hll_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
|
||||
true, false, true));
|
||||
|
||||
//APPROX_COUNT_DISTINCT
|
||||
// Alias of ndv: computes an approximate distinct count using HyperLogLog.
|
||||
addBuiltin(AggregateFunction.createBuiltin("approx_count_distinct",
|
||||
Lists.newArrayList(t), Type.BIGINT, Type.VARCHAR,
|
||||
"_ZN5doris12HllFunctions8hll_initEPN9doris_udf15FunctionContextEPNS1_9StringValE",
|
||||
"_ZN5doris12HllFunctions" + HLL_UPDATE_SYMBOL.get(t),
|
||||
"_ZN5doris12HllFunctions9hll_mergeEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_",
|
||||
"_ZN5doris12HllFunctions13hll_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
|
||||
"_ZN5doris12HllFunctions12hll_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE",
|
||||
true, false, true));
|
||||
|
||||
// BITMAP_UNION_INT
|
||||
addBuiltin(AggregateFunction.createBuiltin(BITMAP_UNION_INT,
|
||||
Lists.newArrayList(t), Type.BIGINT, Type.VARCHAR,
|
||||
|
||||
Reference in New Issue
Block a user