[enhancement](histogram) optimise aggregate function histogram (#15317)

This PR mainly optimizes the histogram (👉🏻 https://github.com/apache/doris/pull/14910) aggregation function, including the following:
1. Support input parameters `sample_rate` and `max_bucket_num`
2. Add UT and regression test
3. Add documentation
4. Optimize function implementation logic
 
Parameter description:
- `sample_rate`: Optional. The proportion of sample data used to generate the histogram. The default is 0.2.
- `max_bucket_num`: Optional. Limits the number of histogram buckets. The default is 128.

---

Example:

```
MySQL [test]> SELECT histogram(c_float) FROM histogram_test;
+-------------------------------------------------------------------------------------------------------------------------------------+
| histogram(`c_float`)                                                                                                                |
+-------------------------------------------------------------------------------------------------------------------------------------+
| {"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} |
+-------------------------------------------------------------------------------------------------------------------------------------+

MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test;
+-------------------------------------------------------------------------------------------------------------------------------------+
| histogram(`c_string`)                                                                                                               |
+-------------------------------------------------------------------------------------------------------------------------------------+
| {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} |
+-------------------------------------------------------------------------------------------------------------------------------------+
```

Query result description:

```
{
    "sample_rate": 0.2, 
    "max_bucket_num": 128, 
    "bucket_num": 3, 
    "buckets": [
        {
            "lower": "0.1", 
            "upper": "0.2", 
            "count": 2, 
            "pre_sum": 0, 
            "ndv": 2
        }, 
        {
            "lower": "0.8", 
            "upper": "0.9", 
            "count": 2, 
            "pre_sum": 2, 
            "ndv": 2
        }, 
        {
            "lower": "1.0", 
            "upper": "1.0", 
            "count": 2, 
            "pre_sum": 4, 
            "ndv": 1
        }
    ]
}
```

Field description:
- sample_rate:Rate of sampling
- max_bucket_num:Limit the maximum number of buckets
- bucket_num:The actual number of buckets
- buckets:All buckets
    - lower: Lower bound of the bucket
    - upper: Upper bound of the bucket
    - count:The number of elements contained in the bucket
    - pre_sum: The total number of elements in all buckets preceding this one
    - ndv:The number of different values in the bucket

> Total number of histogram elements = number of elements in the last bucket (count) + total number of elements in all buckets before it (pre_sum).
This commit is contained in:
ElvinWei
2023-01-07 00:50:32 +08:00
committed by GitHub
parent 9c8fcd805c
commit 76ad599fd7
9 changed files with 877 additions and 388 deletions

View File

@ -17,12 +17,24 @@
#include "vec/aggregate_functions/aggregate_function_histogram.h"
#include "vec/aggregate_functions/helpers.h"
#include "vec/core/types.h"
namespace doris::vectorized {
// Builds the histogram aggregate function for element type T.
//
// The third template argument of AggregateFunctionHistogram records whether
// the call carries extra input parameters. It is set to true only when exactly
// three argument types are supplied, i.e. the column plus the optional
// `sample_rate` and `max_bucket_num` parameters, as in
// `histogram(col, 0.5, 2)`. Any other arity selects the `false` instantiation.
//
// NOTE: the earlier unconditional return of the two-parameter
// AggregateFunctionHistogram<Data, T> has been removed; it made the dispatch
// below unreachable.
template <typename T>
AggregateFunctionPtr create_agg_function_histogram(const DataTypes& argument_types) {
    const bool has_input_param = (argument_types.size() == 3);

    if (has_input_param) {
        return AggregateFunctionPtr(
                new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T, true>(
                        argument_types));
    }

    return AggregateFunctionPtr(
            new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T, false>(
                    argument_types));
}
AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name,
@ -31,34 +43,37 @@ AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name
const bool result_is_nullable) {
WhichDataType type(argument_types[0]);
if (type.is_uint8()) {
return create_agg_function_histogram<UInt8>(argument_types);
} else if (type.is_int8()) {
return create_agg_function_histogram<Int8>(argument_types);
} else if (type.is_int16()) {
return create_agg_function_histogram<Int16>(argument_types);
} else if (type.is_int32()) {
return create_agg_function_histogram<Int32>(argument_types);
} else if (type.is_int64()) {
LOG(INFO) << fmt::format("supported input type {} for aggregate function {}",
argument_types[0]->get_name(), name);
#define DISPATCH(TYPE) \
if (type.idx == TypeIndex::TYPE) return create_agg_function_histogram<TYPE>(argument_types);
FOR_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
if (type.idx == TypeIndex::String) {
return create_agg_function_histogram<String>(argument_types);
}
if (type.idx == TypeIndex::DateTime || type.idx == TypeIndex::Date) {
return create_agg_function_histogram<Int64>(argument_types);
} else if (type.is_int128()) {
return create_agg_function_histogram<Int128>(argument_types);
} else if (type.is_float32()) {
return create_agg_function_histogram<Float32>(argument_types);
} else if (type.is_float64()) {
return create_agg_function_histogram<Float64>(argument_types);
} else if (type.is_decimal32()) {
}
if (type.idx == TypeIndex::DateV2) {
return create_agg_function_histogram<UInt32>(argument_types);
}
if (type.idx == TypeIndex::DateTimeV2) {
return create_agg_function_histogram<UInt64>(argument_types);
}
if (type.idx == TypeIndex::Decimal32) {
return create_agg_function_histogram<Decimal32>(argument_types);
} else if (type.is_decimal64()) {
}
if (type.idx == TypeIndex::Decimal64) {
return create_agg_function_histogram<Decimal64>(argument_types);
} else if (type.is_decimal128()) {
}
if (type.idx == TypeIndex::Decimal128) {
return create_agg_function_histogram<Decimal128>(argument_types);
} else if (type.is_date()) {
return create_agg_function_histogram<Int64>(argument_types);
} else if (type.is_date_time()) {
return create_agg_function_histogram<Int64>(argument_types);
} else if (type.is_string()) {
return create_agg_function_histogram<StringRef>(argument_types);
}
if (type.idx == TypeIndex::Decimal128I) {
return create_agg_function_histogram<Decimal128I>(argument_types);
}
LOG(WARNING) << fmt::format("unsupported input type {} for aggregate function {}",