Files
doris/be/src/exprs/topn_function.h
Youngwb 650536d53e [Feature] Add Topn udaf (#4803)
For #4674 
This is a UDAF for approximate TopN using the Space-Saving algorithm. At present, it can only calculate
the frequent items and their frequencies in a given column; based on this, we can implement TopN functions
similar to those supported by Kylin in the future.

I have also added a test that measures the accuracy of this algorithm. The following is a rough result from one run.
The data set contains 1 million rows and follows a Zipfian distribution. "Element cardinality" is the number of
distinct values in the data. The column headers 20X, 50X, and 100X are the space_expand_rate values (20, 50, 100),
which are used to set the number of counters in the Space-Saving algorithm.

```
zf exponent = 0.5
Element cardinality	        20X        50X          100X
               1000		100%	   100%         100%
               10000		100%	   100%		100%
	       100000		100%	   100%		100%
	       500000		 94%	    98%		 99%

zf exponent = 0.6,1
Element cardinality	        20X        50X          100X
		1000		100%	   100%         100%
		10000		100%	   100%		100%
		100000		100%	   100%		100%
		500000		100%	   100%		100%

```
2020-12-16 21:58:34 +08:00

48 lines
1.6 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef DORIS_BE_SRC_EXPRS_TOPN_FUNCTION_H
#define DORIS_BE_SRC_EXPRS_TOPN_FUNCTION_H
#include "udf/udf.h"
namespace doris {
// Static entry points of the approximate TopN aggregate function (UDAF).
//
// Only declarations live in this header. The signatures follow the usual
// init / update / merge / serialize / finalize phases of an aggregate, with
// the intermediate state passed around as a StringVal.
// NOTE(review): the layout of that state is not visible from this header;
// see the definitions before relying on it.
class TopNFunctions {
public:
    // One-time setup hook. What it sets up cannot be told from the
    // declaration alone -- check the definition.
    static void init();

    // Init phase: receives the state slot `dst` that the later phases use.
    static void topn_init(FunctionContext* ctx, StringVal* dst);

    // Update phase: folds one input value `src` of column type T into `dst`.
    // `topn` is presumably the number of top items requested -- confirm
    // against the definition.
    template <typename T>
    static void topn_update(FunctionContext* ctx, const T& src, const IntVal& topn,
                            StringVal* dst);

    // Update phase with an extra tuning argument. `space_expand_rate`
    // presumably controls how many counters the underlying sketch keeps
    // relative to `topn` -- confirm against the definition.
    template <typename T>
    static void topn_update(FunctionContext* ctx, const T& src, const IntVal& topn,
                            const IntVal& space_expand_rate, StringVal* dst);

    // Merge phase: combines the partial state `src` into `dst`.
    static void topn_merge(FunctionContext* ctx, const StringVal& src, StringVal* dst);

    // Serialize phase: takes the state `src` and returns a StringVal for it.
    static StringVal topn_serialize(FunctionContext* ctx, const StringVal& src);

    // Finalize phase: takes the state `src` and returns the result StringVal.
    static StringVal topn_finalize(FunctionContext* ctx, const StringVal& src);
};
}
#endif //DORIS_BE_SRC_EXPRS_TOPN_FUNCTION_H