100 lines
3.6 KiB
C++
100 lines
3.6 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#pragma once
|
|
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <random>
|
|
|
|
/** Refer to https://stackoverflow.com/questions/9983239/how-to-generate-zipf-distributed-numbers-efficiently
|
|
* Zipf-like random distribution.
|
|
*
|
|
* "Rejection-inversion to generate variates from monotone discrete
|
|
* distributions", Wolfgang Hörmann and Gerhard Derflinger
|
|
* ACM TOMACS 6.3 (1996): 169-184
|
|
*/
|
|
template <class IntType = unsigned long, class RealType = double>
|
|
class zipf_distribution {
|
|
public:
|
|
typedef RealType input_type;
|
|
typedef IntType result_type;
|
|
|
|
static_assert(std::numeric_limits<IntType>::is_integer, "");
|
|
static_assert(!std::numeric_limits<RealType>::is_integer, "");
|
|
|
|
zipf_distribution(const IntType n = std::numeric_limits<IntType>::max(), const RealType q = 1.0)
|
|
: n(n), q(q), H_x1(H(1.5) - 1.0), H_n(H(n + 0.5)), dist(H_x1, H_n) {}
|
|
|
|
IntType operator()(std::mt19937& rng) {
|
|
while (true) {
|
|
const RealType u = dist(rng);
|
|
const RealType x = H_inv(u);
|
|
const IntType k = clamp<IntType>(std::round(x), 1, n);
|
|
if (u >= H(k + 0.5) - h(k)) {
|
|
return k;
|
|
}
|
|
}
|
|
}
|
|
|
|
private:
|
|
/** Clamp x to [min, max]. */
|
|
template <typename T>
|
|
static constexpr T clamp(const T x, const T min, const T max) {
|
|
return std::max(min, std::min(max, x));
|
|
}
|
|
|
|
/** exp(x) - 1 / x */
|
|
static double expxm1bx(const double x) {
|
|
return (std::abs(x) > epsilon) ? std::expm1(x) / x
|
|
: (1.0 + x / 2.0 * (1.0 + x / 3.0 * (1.0 + x / 4.0)));
|
|
}
|
|
|
|
/** H(x) = log(x) if q == 1, (x^(1-q) - 1)/(1 - q) otherwise.
|
|
* H(x) is an integral of h(x).
|
|
*
|
|
* Note the numerator is one less than in the paper order to work with all
|
|
* positive q.
|
|
*/
|
|
const RealType H(const RealType x) {
|
|
const RealType log_x = std::log(x);
|
|
return expxm1bx((1.0 - q) * log_x) * log_x;
|
|
}
|
|
|
|
/** log(1 + x) / x */
|
|
static RealType log1pxbx(const RealType x) {
|
|
return (std::abs(x) > epsilon) ? std::log1p(x) / x
|
|
: 1.0 - x * ((1 / 2.0) - x * ((1 / 3.0) - x * (1 / 4.0)));
|
|
}
|
|
|
|
/** The inverse function of H(x) */
|
|
const RealType H_inv(const RealType x) {
|
|
const RealType t = std::max(-1.0, x * (1.0 - q));
|
|
return std::exp(log1pxbx(t) * x);
|
|
}
|
|
|
|
/** That hat function h(x) = 1 / (x ^ q) */
|
|
const RealType h(const RealType x) { return std::exp(-q * std::log(x)); }
|
|
|
|
static constexpr RealType epsilon = 1e-8;
|
|
|
|
IntType n; ///< Number of elements
|
|
RealType q; ///< Exponent
|
|
RealType H_x1; ///< H(x_1)
|
|
RealType H_n; ///< H(n)
|
|
std::uniform_real_distribution<RealType> dist; ///< [H(x_1), H(n)]
|
|
}; |