For #4674.

This is a UDAF for approximate TopN using the Space-Saving algorithm. At present, it can only calculate the frequent items and their frequencies in a given column; based on this, we can implement TopN functions similar to those supported by Kylin in the future.

I have also added a test that measures the accuracy of this algorithm. The following is a rough result of one run. The data set contains 1 million rows and follows a Zipfian distribution. "Element cardinality" is the cardinality of the data. The column headers 20X, 50X and 100X correspond to space_expand_rate values of 20, 50 and 100; space_expand_rate is used to set the number of counters in the Space-Saving algorithm.

```
zf exponent = 0.5
Element cardinality    20X     50X     100X
1000                   100%    100%    100%
10000                  100%    100%    100%
100000                 100%    100%    100%
500000                 94%     98%     99%

zf exponent = 0.6, 1
Element cardinality    20X     50X     100X
1000                   100%    100%    100%
10000                  100%    100%    100%
100000                 100%    100%    100%
500000                 100%    100%    100%
```
48 lines
1.6 KiB
C++
48 lines
1.6 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef DORIS_BE_SRC_EXPRS_TOPN_FUNCTION_H
#define DORIS_BE_SRC_EXPRS_TOPN_FUNCTION_H

#include "udf/udf.h"

namespace doris {

// Static entry points of the approximate TopN aggregate function (UDAF).
//
// Every stage exchanges its aggregation state through a StringVal: the
// init/update/merge stages write to `dst`, the serialize/finalize stages read
// from `src` and return a StringVal.
//
// NOTE(review): only declarations are visible in this header. The per-stage
// descriptions below are inferred from the function names and signatures and
// should be confirmed against the definitions.
class TopNFunctions {
public:
    // Setup hook for this function family.
    // NOTE(review): what it registers or initializes is not visible here.
    static void init();

    // Prepares the aggregation state in `dst` (presumably an empty state
    // before the first update -- confirm in the definition).
    static void topn_init(FunctionContext*, StringVal* dst);

    // Folds one input value `src` into the state `dst`.
    // `topn` is presumably the number of top items requested -- confirm.
    // T is the UDF value type of the aggregated column.
    template <typename T>
    static void topn_update(FunctionContext*, const T& src, const IntVal& topn, StringVal* dst);

    // Same as the overload above, with an additional `space_expand_rate`
    // argument (presumably used to size the algorithm's counter set relative
    // to `topn` -- confirm in the definition).
    template <typename T>
    static void topn_update(FunctionContext*, const T& src, const IntVal& topn,
                            const IntVal& space_expand_rate, StringVal* dst);

    // Combines the state carried by `src` into the state `dst`.
    // NOTE(review): `src` is presumably a state produced by topn_serialize.
    static void topn_merge(FunctionContext*, const StringVal& src, StringVal* dst);

    // Returns a StringVal form of the state `src` (presumably the
    // representation consumed by topn_merge -- confirm).
    static StringVal topn_serialize(FunctionContext* ctx, const StringVal& src);

    // Produces the final result of the aggregation from the state `src`.
    // NOTE(review): the output format is defined by the implementation.
    static StringVal topn_finalize(FunctionContext*, const StringVal& src);
};

} // namespace doris

#endif // DORIS_BE_SRC_EXPRS_TOPN_FUNCTION_H