For #4674. This is a UDAF for approximate top-N based on the Space-Saving algorithm. At present it can only compute the frequent items of a given column together with their frequencies; on top of this we can later implement topN functions similar to those supported by Kylin. I have also added a test that measures the accuracy of the algorithm; a rough result of one run is shown below. The data set contains 1 million rows and follows a Zipfian distribution. "Element cardinality" is the cardinality of the data, and the columns 20X, 50X, ... correspond to space_expand_rate values of 20, 50, ..., which set the number of counters used by the Space-Saving algorithm.
```
zf exponent = 0.5
Element cardinality   20X    50X    100X
1000                  100%   100%   100%
10000                 100%   100%   100%
100000                100%   100%   100%
500000                94%    98%    99%

zf exponent = 0.6, 1
Element cardinality   20X    50X    100X
1000                  100%   100%   100%
10000                 100%   100%   100%
100000                100%   100%   100%
500000                100%   100%   100%
```
120 lines
3.7 KiB
C++
120 lines
3.7 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
#include <cmath>
#include <limits>
#include <random>
|
|
|
|
/** Zipf-like random distribution over the integers [1, n].
 *
 * Refer to https://stackoverflow.com/questions/9983239/how-to-generate-zipf-distributed-numbers-efficiently
 *
 * Sampling uses rejection-inversion:
 * "Rejection-inversion to generate variates from monotone discrete
 * distributions", Wolfgang Hörmann and Gerhard Derflinger
 * ACM TOMACS 6.3 (1996): 169-184
 *
 * A value k is produced with weight h(k) = 1 / k^q, so small values are the
 * most frequent ones.
 *
 * Example:
 *     std::mt19937 rng(42);
 *     zipf_distribution<> zipf(1000, 0.5);
 *     unsigned long k = zipf(rng);   // 1 <= k <= 1000
 *
 * @tparam IntType  integer type of the generated values
 * @tparam RealType floating-point type used for the internal arithmetic
 *
 * NOTE(review): the arguments are not validated; n >= 1 and q > 0 appear to
 * be required for the sampling interval [H(x_1), H(n)] to be well formed.
 */
template<class IntType = unsigned long, class RealType = double>
class zipf_distribution
{
public:
    using input_type = RealType;
    using result_type = IntType;

    static_assert(std::numeric_limits<IntType>::is_integer,
                  "IntType must be an integer type");
    static_assert(!std::numeric_limits<RealType>::is_integer,
                  "RealType must be a floating-point type");

    /** Builds a distribution over [1, n] with exponent q.
     *
     * The initializers of H_x1 and H_n call H(), which reads the member q,
     * so q has to stay declared (and therefore initialized) before them.
     *
     * @param n number of elements, i.e. the largest value that can be drawn
     * @param q exponent of the power law
     */
    zipf_distribution(const IntType n = std::numeric_limits<IntType>::max(),
                      const RealType q = 1.0)
        : n(n)
        , q(q)
        , H_x1(H(1.5) - 1.0)
        , H_n(H(static_cast<RealType>(n) + RealType(0.5)))
        , dist(H_x1, H_n)
    {}

    /** Draws one value in [1, n].
     *
     * @param rng random bit generator; anything accepted by
     *            std::uniform_real_distribution works (std::mt19937,
     *            std::mt19937_64, ...)
     * @return the sampled value
     */
    template<class Generator>
    IntType operator()(Generator& rng)
    {
        while (true) {
            const RealType u = dist(rng);
            const RealType x = H_inv(u);
            const IntType k = round_to_range(x);
            const RealType k_real = static_cast<RealType>(k);
            // Accept k only when u lies in the part of k's slot that is
            // covered by h(k); otherwise reject and draw again.
            if (u >= H(k_real + RealType(0.5)) - h(k_real)) {
                return k;
            }
        }
    }

private:
    /** Rounds x to the nearest integer and saturates it to [1, n].
     *
     * The bounds are checked on the floating-point value *before* the
     * conversion: converting a floating-point value that IntType cannot
     * represent is undefined behaviour. NaN maps to 1.
     */
    IntType round_to_range(const RealType x) const
    {
        const RealType rounded = std::round(x);
        if (!(rounded > RealType(1))) {
            return 1;
        }
        if (rounded >= static_cast<RealType>(n)) {
            return n;
        }
        return static_cast<IntType>(rounded);
    }

    /** (exp(x) - 1) / x, switching to a short series when |x| <= epsilon. */
    static RealType expxm1bx(const RealType x)
    {
        return (std::abs(x) > epsilon)
            ? std::expm1(x) / x
            : (1.0 + x/2.0 * (1.0 + x/3.0 * (1.0 + x/4.0)));
    }

    /** H(x) = log(x) if q == 1, (x^(1-q) - 1)/(1 - q) otherwise.
     * H(x) is an integral of h(x).
     *
     * Note the numerator is one less than in the paper in order to work with
     * all positive q.
     */
    RealType H(const RealType x) const
    {
        const RealType log_x = std::log(x);
        return expxm1bx((1.0 - q) * log_x) * log_x;
    }

    /** log(1 + x) / x, switching to a short series when |x| <= epsilon. */
    static RealType log1pxbx(const RealType x)
    {
        return (std::abs(x) > epsilon)
            ? std::log1p(x) / x
            : 1.0 - x * ((1/2.0) - x * ((1/3.0) - x * (1/4.0)));
    }

    /** The inverse function of H(x). */
    RealType H_inv(const RealType x) const
    {
        // The explicit template argument keeps this well formed when
        // RealType is not double (both arguments must have the same type).
        const RealType t = std::max<RealType>(-1.0, x * (1.0 - q));
        return std::exp(log1pxbx(t) * x);
    }

    /** The hat function h(x) = 1 / (x ^ q). */
    RealType h(const RealType x) const
    {
        return std::exp(-q * std::log(x));
    }

    /// Below this magnitude the helper functions use their series expansion.
    static constexpr RealType epsilon = 1e-8;

    IntType n;      ///< Number of elements
    RealType q;     ///< Exponent
    RealType H_x1;  ///< H(x_1)
    RealType H_n;   ///< H(n)
    std::uniform_real_distribution<RealType> dist;  ///< [H(x_1), H(n)]
};