diff --git a/docs/.vuepress/sidebar/en.js b/docs/.vuepress/sidebar/en.js index 1b865b8c99..3fb8930200 100644 --- a/docs/.vuepress/sidebar/en.js +++ b/docs/.vuepress/sidebar/en.js @@ -267,7 +267,7 @@ module.exports = [ "hll_union_agg", "max", "min", - "ndv", + "approx_count_distinct", "percentile_approx", "stddev", "stddev_samp", diff --git a/docs/.vuepress/sidebar/zh-CN.js b/docs/.vuepress/sidebar/zh-CN.js index fcbd17d5b5..da69db3ec9 100644 --- a/docs/.vuepress/sidebar/zh-CN.js +++ b/docs/.vuepress/sidebar/zh-CN.js @@ -281,7 +281,7 @@ module.exports = [ "hll_union_agg", "max", "min", - "ndv", + "approx_count_distinct", "percentile_approx", "stddev", "stddev_samp", diff --git a/docs/en/getting-started/hit-the-rollup.md b/docs/en/getting-started/hit-the-rollup.md index 23b5642227..f8a0fa9dc3 100644 --- a/docs/en/getting-started/hit-the-rollup.md +++ b/docs/en/getting-started/hit-the-rollup.md @@ -226,7 +226,7 @@ Of course, the function of aggregated data is indispensable for general polymer The following are some types of aggregated queries that can hit Rollup. 
-| Column type Query type | Sum | Distinct/Count Distinct | Min | Max | Ndv | +| Column type Query type | Sum | Distinct/Count Distinct | Min | Max | APPROX_COUNT_DISTINCT | |--------------|-------|-------------------------|-------|-------|-------| | Key | false | true | true | true | true | | Value(Sum) | true | false | false | false | false | diff --git a/docs/en/sql-reference/sql-functions/aggregate-functions/ndv.md b/docs/en/sql-reference/sql-functions/aggregate-functions/approx_count_distinct.md similarity index 83% rename from docs/en/sql-reference/sql-functions/aggregate-functions/ndv.md rename to docs/en/sql-reference/sql-functions/aggregate-functions/approx_count_distinct.md index e8cdbc2df3..4682830cba 100644 --- a/docs/en/sql-reference/sql-functions/aggregate-functions/ndv.md +++ b/docs/en/sql-reference/sql-functions/aggregate-functions/approx_count_distinct.md @@ -1,6 +1,6 @@ --- { - "title": "NDV", + "title": "APPROX_COUNT_DISTINCT", "language": "en" } --- @@ -24,11 +24,11 @@ specific language governing permissions and limitations under the License. --> -# NDV +# APPROX_COUNT_DISTINCT ## Description ### Syntax -`NDV (expr)` +`APPROX_COUNT_DISTINCT (expr)` Returns an approximate aggregation function similar to the result of COUNT (DISTINCT col). 
@@ -37,12 +37,12 @@ It combines COUNT and DISTINCT faster and uses fixed-size memory, so less memory ## example ``` -MySQL > select ndv(query_id) from log_statis group by datetime; +MySQL > select approx_count_distinct(query_id) from log_statis group by datetime; +-----------------+ -| ndv(`query_id`) | +| approx_count_distinct(`query_id`) | +-----------------+ | 17721 | +-----------------+ ``` ##keyword -NDV +APPROX_COUNT_DISTINCT diff --git a/docs/en/sql-reference/sql-statements/Data Definition/HLL.md b/docs/en/sql-reference/sql-statements/Data Definition/HLL.md index f8d61a954d..4499b88bd1 100644 --- a/docs/en/sql-reference/sql-statements/Data Definition/HLL.md +++ b/docs/en/sql-reference/sql-statements/Data Definition/HLL.md @@ -72,7 +72,7 @@ distributed by hash(id) buckets 32; curl --location-trusted -uname:password -T data -H "label:load_1" -H "columns:dt, id, name, province, sex, cuid, os, set1=hll_hash(cuid), set2=hll_hash(os)" http://host/api/test_db/test/_stream_load -3. There are three common ways of aggregating data: (without aggregating the base table directly, the speed may be similar to that of using NDV directly) +3. There are three common ways of aggregating data: (without aggregating the base table directly, the speed may be similar to that of using APPROX_COUNT_DISTINCT directly) A. Create a rollup that allows HLL columns to generate aggregation. 
alter table test add rollup test_rollup(dt, set1); diff --git a/docs/zh-CN/getting-started/hit-the-rollup.md b/docs/zh-CN/getting-started/hit-the-rollup.md index 6f3dd6eca6..9ab8d5b20e 100644 --- a/docs/zh-CN/getting-started/hit-the-rollup.md +++ b/docs/zh-CN/getting-started/hit-the-rollup.md @@ -226,7 +226,7 @@ rollup_index4(k4, k6, k5, k1, k2, k3, k7) 以下是可以命中Rollup的一些聚合查询的种类, -| 列类型 查询类型 | Sum | Distinct/Count Distinct | Min | Max | Ndv | +| 列类型 查询类型 | Sum | Distinct/Count Distinct | Min | Max | APPROX_COUNT_DISTINCT | |--------------|-------|-------------------------|-------|-------|-------| | Key | false | true | true | true | true | | Value(Sum) | true | false | false | false | false | diff --git a/docs/zh-CN/sql-reference/sql-functions/aggregate-functions/ndv.md b/docs/zh-CN/sql-reference/sql-functions/aggregate-functions/approx_count_distinct.md similarity index 83% rename from docs/zh-CN/sql-reference/sql-functions/aggregate-functions/ndv.md rename to docs/zh-CN/sql-reference/sql-functions/aggregate-functions/approx_count_distinct.md index c2e857e98d..572e58e17d 100644 --- a/docs/zh-CN/sql-reference/sql-functions/aggregate-functions/ndv.md +++ b/docs/zh-CN/sql-reference/sql-functions/aggregate-functions/approx_count_distinct.md @@ -1,6 +1,6 @@ --- { - "title": "NDV", + "title": "APPROX_COUNT_DISTINCT", "language": "zh-CN" } --- @@ -24,11 +24,11 @@ specific language governing permissions and limitations under the License. --> -# NDV +# APPROX_COUNT_DISTINCT ## description ### Syntax -`NDV(expr)` +`APPROX_COUNT_DISTINCT(expr)` 返回类似于 COUNT(DISTINCT col) 结果的近似值聚合函数。 @@ -37,12 +37,12 @@ under the License. 
## example ``` -MySQL > select ndv(query_id) from log_statis group by datetime; +MySQL > select approx_count_distinct(query_id) from log_statis group by datetime; +-----------------+ -| ndv(`query_id`) | +| approx_count_distinct(`query_id`) | +-----------------+ | 17721 | +-----------------+ ``` ##keyword -NDV +APPROX_COUNT_DISTINCT diff --git a/docs/zh-CN/sql-reference/sql-statements/Data Definition/HLL.md b/docs/zh-CN/sql-reference/sql-statements/Data Definition/HLL.md index dd1a108c4e..1d3f65ba97 100644 --- a/docs/zh-CN/sql-reference/sql-statements/Data Definition/HLL.md +++ b/docs/zh-CN/sql-reference/sql-statements/Data Definition/HLL.md @@ -69,7 +69,7 @@ under the License. curl --location-trusted -uname:password -T data -H "label:load_1" -H "columns:dt, id, name, province, sex, cuid, os, set1=hll_hash(cuid), set2=hll_hash(os)" http://host/api/test_db/test/_stream_load - 3. 聚合数据,常用方式3种:(如果不聚合直接对base表查询,速度可能跟直接使用ndv速度差不多) + 3. 聚合数据,常用方式3种:(如果不聚合直接对base表查询,速度可能跟直接使用approx_count_distinct速度差不多) a. 
创建一个rollup,让hll列产生聚合, alter table test add rollup test_rollup(dt, set1); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java index d581ba4733..67436cf75e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/FunctionSet.java @@ -997,6 +997,17 @@ public class FunctionSet { "_ZN5doris12HllFunctions12hll_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE", true, false, true)); + // APPROX_COUNT_DISTINCT + // alias of ndv: computes an approximate distinct count using HyperLogLog + addBuiltin(AggregateFunction.createBuiltin("approx_count_distinct", + Lists.newArrayList(t), Type.BIGINT, Type.VARCHAR, + "_ZN5doris12HllFunctions8hll_initEPN9doris_udf15FunctionContextEPNS1_9StringValE", + "_ZN5doris12HllFunctions" + HLL_UPDATE_SYMBOL.get(t), + "_ZN5doris12HllFunctions9hll_mergeEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_", + "_ZN5doris12HllFunctions13hll_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE", + "_ZN5doris12HllFunctions12hll_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE", + true, false, true)); + // BITMAP_UNION_INT addBuiltin(AggregateFunction.createBuiltin(BITMAP_UNION_INT, Lists.newArrayList(t), Type.BIGINT, Type.VARCHAR,