[enhancement](histogram) optimise aggregate function histogram (#15317)
This PR mainly optimizes the histogram (👉🏻 https://github.com/apache/doris/pull/14910) aggregate function, including the following: 1. Support input parameters `sample_rate` and `max_bucket_num` 2. Add UT and regression test 3. Add documentation 4. Optimize function implementation logic Parameter description: - `sample_rate`: Optional. The proportion of sample data used to generate the histogram. The default is 0.2. - `max_bucket_num`: Optional. Limit the number of histogram buckets. The default value is 128. --- Example: ``` MySQL [test]> SELECT histogram(c_float) FROM histogram_test; +-------------------------------------------------------------------------------------------------------------------------------------+ | histogram(`c_float`) | +-------------------------------------------------------------------------------------------------------------------------------------+ | {"sample_rate":0.2,"max_bucket_num":128,"bucket_num":3,"buckets":[{"lower":"0.1","upper":"0.1","count":1,"pre_sum":0,"ndv":1},...]} | +-------------------------------------------------------------------------------------------------------------------------------------+ MySQL [test]> SELECT histogram(c_string, 0.5, 2) FROM histogram_test; +-------------------------------------------------------------------------------------------------------------------------------------+ | histogram(`c_string`) | +-------------------------------------------------------------------------------------------------------------------------------------+ | {"sample_rate":0.5,"max_bucket_num":2,"bucket_num":2,"buckets":[{"lower":"str1","upper":"str7","count":4,"pre_sum":0,"ndv":3},...]} | +-------------------------------------------------------------------------------------------------------------------------------------+ ``` Query result description: ``` { "sample_rate": 0.2, "max_bucket_num": 128, "bucket_num": 3, "buckets": [ { "lower": "0.1", "upper": "0.2", "count": 2, "pre_sum": 0, "ndv": 2 }, { "lower": 
"0.8", "upper": "0.9", "count": 2, "pre_sum": 2, "ndv": 2 }, { "lower": "1.0", "upper": "1.0", "count": 2, "pre_sum": 4, "ndv": 1 } ] } ``` Field description: - sample_rate: The sampling rate - max_bucket_num: The maximum number of buckets allowed - bucket_num: The actual number of buckets - buckets: All buckets - lower: Lower bound of the bucket - upper: Upper bound of the bucket - count: The number of elements contained in the bucket - pre_sum: The total number of elements in all preceding buckets - ndv: The number of distinct values in the bucket > Total number of histogram elements = number of elements in the last bucket (count) + total number of elements in the preceding buckets (pre_sum).
This commit is contained in:
@ -17,12 +17,24 @@
|
||||
|
||||
#include "vec/aggregate_functions/aggregate_function_histogram.h"
|
||||
|
||||
#include "vec/aggregate_functions/helpers.h"
|
||||
#include "vec/core/types.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
template <typename T>
|
||||
AggregateFunctionPtr create_agg_function_histogram(const DataTypes& argument_types) {
|
||||
return AggregateFunctionPtr(
|
||||
new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T>(argument_types));
|
||||
bool has_input_param = (argument_types.size() == 3);
|
||||
|
||||
if (has_input_param) {
|
||||
return AggregateFunctionPtr(
|
||||
new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T, true>(
|
||||
argument_types));
|
||||
} else {
|
||||
return AggregateFunctionPtr(
|
||||
new AggregateFunctionHistogram<AggregateFunctionHistogramData<T>, T, false>(
|
||||
argument_types));
|
||||
}
|
||||
}
|
||||
|
||||
AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name,
|
||||
@ -31,34 +43,37 @@ AggregateFunctionPtr create_aggregate_function_histogram(const std::string& name
|
||||
const bool result_is_nullable) {
|
||||
WhichDataType type(argument_types[0]);
|
||||
|
||||
if (type.is_uint8()) {
|
||||
return create_agg_function_histogram<UInt8>(argument_types);
|
||||
} else if (type.is_int8()) {
|
||||
return create_agg_function_histogram<Int8>(argument_types);
|
||||
} else if (type.is_int16()) {
|
||||
return create_agg_function_histogram<Int16>(argument_types);
|
||||
} else if (type.is_int32()) {
|
||||
return create_agg_function_histogram<Int32>(argument_types);
|
||||
} else if (type.is_int64()) {
|
||||
LOG(INFO) << fmt::format("supported input type {} for aggregate function {}",
|
||||
argument_types[0]->get_name(), name);
|
||||
|
||||
#define DISPATCH(TYPE) \
|
||||
if (type.idx == TypeIndex::TYPE) return create_agg_function_histogram<TYPE>(argument_types);
|
||||
FOR_NUMERIC_TYPES(DISPATCH)
|
||||
#undef DISPATCH
|
||||
|
||||
if (type.idx == TypeIndex::String) {
|
||||
return create_agg_function_histogram<String>(argument_types);
|
||||
}
|
||||
if (type.idx == TypeIndex::DateTime || type.idx == TypeIndex::Date) {
|
||||
return create_agg_function_histogram<Int64>(argument_types);
|
||||
} else if (type.is_int128()) {
|
||||
return create_agg_function_histogram<Int128>(argument_types);
|
||||
} else if (type.is_float32()) {
|
||||
return create_agg_function_histogram<Float32>(argument_types);
|
||||
} else if (type.is_float64()) {
|
||||
return create_agg_function_histogram<Float64>(argument_types);
|
||||
} else if (type.is_decimal32()) {
|
||||
}
|
||||
if (type.idx == TypeIndex::DateV2) {
|
||||
return create_agg_function_histogram<UInt32>(argument_types);
|
||||
}
|
||||
if (type.idx == TypeIndex::DateTimeV2) {
|
||||
return create_agg_function_histogram<UInt64>(argument_types);
|
||||
}
|
||||
if (type.idx == TypeIndex::Decimal32) {
|
||||
return create_agg_function_histogram<Decimal32>(argument_types);
|
||||
} else if (type.is_decimal64()) {
|
||||
}
|
||||
if (type.idx == TypeIndex::Decimal64) {
|
||||
return create_agg_function_histogram<Decimal64>(argument_types);
|
||||
} else if (type.is_decimal128()) {
|
||||
}
|
||||
if (type.idx == TypeIndex::Decimal128) {
|
||||
return create_agg_function_histogram<Decimal128>(argument_types);
|
||||
} else if (type.is_date()) {
|
||||
return create_agg_function_histogram<Int64>(argument_types);
|
||||
} else if (type.is_date_time()) {
|
||||
return create_agg_function_histogram<Int64>(argument_types);
|
||||
} else if (type.is_string()) {
|
||||
return create_agg_function_histogram<StringRef>(argument_types);
|
||||
}
|
||||
if (type.idx == TypeIndex::Decimal128I) {
|
||||
return create_agg_function_histogram<Decimal128I>(argument_types);
|
||||
}
|
||||
|
||||
LOG(WARNING) << fmt::format("unsupported input type {} for aggregate function {}",
|
||||
|
||||
Reference in New Issue
Block a user