Currently, there are some useless includes in the codebase. We can use a tool named include-what-you-use to optimize these includes. By using a strict include-what-you-use policy, we can get lots of benefits from it.
241 lines
7.4 KiB
C++
241 lines
7.4 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "util/tdigest.h"
|
|
|
|
#include <gtest/gtest-message.h>
|
|
#include <gtest/gtest-test-part.h>
|
|
|
|
#include <memory>
|
|
#include <random>
|
|
|
|
#include "gtest/gtest_pred_impl.h"
|
|
#include "testutil/test_util.h"
|
|
|
|
namespace doris {
|
|
|
|
class TDigestTest : public ::testing::Test {
|
|
protected:
|
|
// You can remove any or all of the following functions if its body
|
|
// is empty.
|
|
TDigestTest() {
|
|
// You can do set-up work for each test here.
|
|
}
|
|
|
|
virtual ~TDigestTest() {
|
|
// You can do clean-up work that doesn't throw exceptions here.
|
|
}
|
|
|
|
// If the constructor and destructor are not enough for setting up
|
|
// and cleaning up each test, you can define the following methods:
|
|
|
|
virtual void SetUp() {
|
|
// Code here will be called immediately after the constructor (right
|
|
// before each test).
|
|
}
|
|
|
|
virtual void TearDown() {
|
|
// Code here will be called immediately after each test (right
|
|
// before the destructor).
|
|
}
|
|
};
|
|
|
|
static double quantile(const double q, const std::vector<double>& values) {
|
|
double q1;
|
|
if (values.size() == 0) {
|
|
q1 = NAN;
|
|
} else if (q == 1 || values.size() == 1) {
|
|
q1 = values[values.size() - 1];
|
|
} else {
|
|
auto index = q * values.size();
|
|
if (index < 0.5) {
|
|
q1 = values[0];
|
|
} else if (values.size() - index < 0.5) {
|
|
q1 = values[values.size() - 1];
|
|
} else {
|
|
index -= 0.5;
|
|
const int intIndex = static_cast<int>(index);
|
|
q1 = values[intIndex + 1] * (index - intIndex) +
|
|
values[intIndex] * (intIndex + 1 - index);
|
|
}
|
|
}
|
|
return q1;
|
|
}
|
|
|
|
TEST_F(TDigestTest, CrashAfterMerge) {
|
|
TDigest digest(1000);
|
|
std::uniform_real_distribution<> reals(0.0, 1.0);
|
|
std::random_device gen;
|
|
for (int i = 0; i < LOOP_LESS_OR_MORE(100, 100000); i++) {
|
|
digest.add(reals(gen));
|
|
}
|
|
digest.compress();
|
|
|
|
TDigest digest2(1000);
|
|
digest2.merge(&digest);
|
|
digest2.quantile(0.5);
|
|
}
|
|
|
|
TEST_F(TDigestTest, EmptyDigest) {
|
|
TDigest digest(100);
|
|
EXPECT_EQ(0, digest.processed().size());
|
|
}
|
|
|
|
TEST_F(TDigestTest, SingleValue) {
|
|
TDigest digest(100);
|
|
std::random_device gen;
|
|
std::uniform_real_distribution<> dist(0, 1000);
|
|
const auto value = dist(gen);
|
|
digest.add(value);
|
|
std::uniform_real_distribution<> dist2(0, 1.0);
|
|
const double q = dist2(gen);
|
|
EXPECT_NEAR(value, digest.quantile(0.0), 0.001f);
|
|
EXPECT_NEAR(value, digest.quantile(q), 0.001f);
|
|
EXPECT_NEAR(value, digest.quantile(1.0), 0.001f);
|
|
}
|
|
|
|
TEST_F(TDigestTest, FewValues) {
|
|
// When there are few values in the tree, quantiles should be exact
|
|
TDigest digest(1000);
|
|
|
|
std::random_device gen;
|
|
std::uniform_real_distribution<> reals(0.0, 100.0);
|
|
std::uniform_int_distribution<> dist(0, 10);
|
|
std::uniform_int_distribution<> bools(0, 1);
|
|
std::uniform_real_distribution<> qvalue(0.0, 1.0);
|
|
|
|
const auto length = 10; //dist(gen);
|
|
|
|
std::vector<double> values;
|
|
values.reserve(length);
|
|
for (int i = 0; i < length; ++i) {
|
|
auto const value = (i == 0 || bools(gen)) ? reals(gen) : values[i - 1];
|
|
digest.add(value);
|
|
values.push_back(value);
|
|
}
|
|
std::sort(values.begin(), values.end());
|
|
digest.compress();
|
|
|
|
EXPECT_EQ(digest.processed().size(), values.size());
|
|
|
|
std::vector<double> testValues {0.0, 1.0e-10, qvalue(gen), 0.5, 1.0 - 1e-10, 1.0};
|
|
for (auto q : testValues) {
|
|
double q1 = quantile(q, values);
|
|
auto q2 = digest.quantile(q);
|
|
if (std::isnan(q1)) {
|
|
EXPECT_TRUE(std::isnan(q2));
|
|
} else {
|
|
EXPECT_NEAR(q1, q2, 0.03) << "q = " << q;
|
|
}
|
|
}
|
|
}
|
|
|
|
TEST_F(TDigestTest, MoreThan2BValues) {
|
|
TDigest digest(1000);
|
|
|
|
std::random_device gen;
|
|
std::uniform_real_distribution<> reals(0.0, 1.0);
|
|
for (int i = 0; i < 1000; ++i) {
|
|
const double next = reals(gen);
|
|
digest.add(next);
|
|
}
|
|
for (int i = 0; i < 10; ++i) {
|
|
const double next = reals(gen);
|
|
const auto count = 1L << 28;
|
|
digest.add(next, count);
|
|
}
|
|
EXPECT_EQ(static_cast<long>(1000 + float(10L * (1 << 28))), digest.totalWeight());
|
|
EXPECT_GT(digest.totalWeight(), std::numeric_limits<int32_t>::max());
|
|
std::vector<double> quantiles {0, 0.1, 0.5, 0.9, 1, reals(gen)};
|
|
std::sort(quantiles.begin(), quantiles.end());
|
|
auto prev = std::numeric_limits<double>::min();
|
|
for (double q : quantiles) {
|
|
const double v = digest.quantile(q);
|
|
EXPECT_GE(v, prev) << "q = " << q;
|
|
prev = v;
|
|
}
|
|
}
|
|
|
|
TEST_F(TDigestTest, MergeTest) {
|
|
TDigest digest1(1000);
|
|
TDigest digest2(1000);
|
|
|
|
digest2.add(std::vector<const TDigest*> {&digest1});
|
|
}
|
|
|
|
TEST_F(TDigestTest, TestSorted) {
|
|
TDigest digest(1000);
|
|
std::uniform_real_distribution<> reals(0.0, 1.0);
|
|
std::uniform_int_distribution<> ints(0, 10);
|
|
|
|
std::random_device gen;
|
|
for (int i = 0; i < 10000; ++i) {
|
|
digest.add(reals(gen), 1 + ints(gen));
|
|
}
|
|
digest.compress();
|
|
Centroid previous(0, 0);
|
|
for (auto centroid : digest.processed()) {
|
|
if (previous.weight() != 0) {
|
|
CHECK_LE(previous.mean(), centroid.mean());
|
|
}
|
|
previous = centroid;
|
|
}
|
|
}
|
|
|
|
TEST_F(TDigestTest, ExtremeQuantiles) {
|
|
TDigest digest(1000);
|
|
// t-digest shouldn't merge extreme nodes, but let's still test how it would
|
|
// answer to extreme quantiles in that case ('extreme' in the sense that the
|
|
// quantile is either before the first node or after the last one)
|
|
|
|
digest.add(10, 3);
|
|
digest.add(20, 1);
|
|
digest.add(40, 5);
|
|
// this group tree is roughly equivalent to the following sorted array:
|
|
// [ ?, 10, ?, 20, ?, ?, 50, ?, ? ]
|
|
// and we expect it to compute approximate missing values:
|
|
// [ 5, 10, 15, 20, 30, 40, 50, 60, 70]
|
|
std::vector<double> values {5.0, 10.0, 15.0, 20.0, 30.0, 35.0, 40.0, 45.0, 50.0};
|
|
std::vector<double> quantiles {1.5 / 9.0, 3.5 / 9.0, 6.5 / 9.0};
|
|
for (auto q : quantiles) {
|
|
EXPECT_NEAR(quantile(q, values), digest.quantile(q), 0.01) << "q = " << q;
|
|
}
|
|
}
|
|
|
|
TEST_F(TDigestTest, Montonicity) {
|
|
TDigest digest(1000);
|
|
std::uniform_real_distribution<> reals(0.0, 1.0);
|
|
std::random_device gen;
|
|
for (int i = 0; i < LOOP_LESS_OR_MORE(10, 100000); i++) {
|
|
digest.add(reals(gen));
|
|
}
|
|
|
|
double lastQuantile = -1;
|
|
double lastX = -1;
|
|
for (double z = 0; z <= 1; z += LOOP_LESS_OR_MORE(0.1, 1e-5)) {
|
|
double x = digest.quantile(z);
|
|
EXPECT_GE(x, lastX);
|
|
lastX = x;
|
|
|
|
double q = digest.cdf(z);
|
|
EXPECT_GE(q, lastQuantile);
|
|
lastQuantile = q;
|
|
}
|
|
}
|
|
|
|
} // namespace doris
|