// doris/be/test/util/tdigest_test.cpp

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "util/tdigest.h"

#include <gtest/gtest-message.h>
#include <gtest/gtest-test-part.h>

#include <memory>
#include <random>

#include "gtest/gtest_pred_impl.h"
#include "testutil/test_util.h"

namespace doris {

// Fixture shared by every TDigest test case in this file. It holds no state;
// the four hooks below are intentionally empty and serve only as extension
// points.
class TDigestTest : public ::testing::Test {
protected:
    // Per-test set-up work would go here.
    TDigestTest() = default;

    // Per-test clean-up work (which must not throw) would go here.
    virtual ~TDigestTest() = default;

    // Called immediately after the constructor, right before each test.
    virtual void SetUp() {}

    // Called immediately after each test, right before the destructor.
    virtual void TearDown() {}
};

// Reference quantile used to cross-check the values returned by TDigest.
//
// Sample i of `values` is treated as sitting at position i + 0.5 on a [0, n]
// axis (the centre of its slot). The requested position is q * n and the
// result is linearly interpolated between the two neighbouring samples.
// Positions before the centre of the first slot clamp to the first sample;
// positions at or after the centre of the last slot clamp to the last sample.
//
// The callers in this file pass `values` sorted in ascending order.
//
// Returns NAN for an empty input, and the last element when q == 1 or when
// there is exactly one sample.
static double quantile(const double q, const std::vector<double>& values) {
    if (values.empty()) {
        return NAN;
    }
    const auto count = values.size();
    if (q == 1 || count == 1) {
        return values.back();
    }

    double index = q * count;
    if (index < 0.5) {
        return values.front();
    }
    // `<=` rather than `<`: when the position lands exactly on the centre of
    // the last slot, the upper interpolation neighbour would be values[count],
    // i.e. one element past the end of the vector.
    if (count - index <= 0.5) {
        return values.back();
    }

    index -= 0.5;
    const int intIndex = static_cast<int>(index);
    return values[intIndex + 1] * (index - intIndex) + values[intIndex] * (intIndex + 1 - index);
}

// Smoke test (judging by its name, a crash regression): asking for a quantile
// from a digest whose only content came from merging another, already
// compressed, digest must simply run to completion. Nothing is asserted.
TEST_F(TDigestTest, CrashAfterMerge) {
    TDigest source(1000);
    std::random_device entropy;
    std::uniform_real_distribution<> unit(0.0, 1.0);
    for (int added = 0; added < LOOP_LESS_OR_MORE(100, 100000); ++added) {
        source.add(unit(entropy));
    }
    source.compress();

    TDigest target(1000);
    target.merge(&source);
    // The returned value is deliberately ignored; the call itself is the test.
    target.quantile(0.5);
}

// A freshly constructed digest must report an empty processed() collection.
TEST_F(TDigestTest, EmptyDigest) {
    TDigest fresh(100);
    const auto processed_count = fresh.processed().size();
    EXPECT_EQ(0, processed_count);
}

// With exactly one sample in the digest, every quantile must equal that
// sample (within a small tolerance).
TEST_F(TDigestTest, SingleValue) {
    std::random_device entropy;
    TDigest digest(100);

    // Draw the single sample from [0, 1000) and feed it to the digest.
    std::uniform_real_distribution<> sample_range(0, 1000);
    const auto sample = sample_range(entropy);
    digest.add(sample);

    // Pick one random interior quantile in addition to the two end points.
    std::uniform_real_distribution<> unit(0, 1.0);
    const double random_q = unit(entropy);

    EXPECT_NEAR(sample, digest.quantile(0.0), 0.001f);
    EXPECT_NEAR(sample, digest.quantile(random_q), 0.001f);
    EXPECT_NEAR(sample, digest.quantile(1.0), 0.001f);
}

// With only a handful of samples the digest is expected to keep one centroid
// per sample (asserted below) and therefore to agree with the reference
// quantile() helper up to a small tolerance.
TEST_F(TDigestTest, FewValues) {
    // When there are few values in the tree, quantiles should be exact
    TDigest digest(1000);

    std::random_device gen;
    std::uniform_real_distribution<> reals(0.0, 100.0);
    std::uniform_int_distribution<> bools(0, 1);
    std::uniform_real_distribution<> qvalue(0.0, 1.0);

    const int length = 10;

    std::vector<double> values;
    values.reserve(length);
    for (int i = 0; i < length; ++i) {
        // The first sample is always fresh; afterwards a coin flip decides
        // between a fresh sample and a repeat of the previous one, so that
        // duplicate values are exercised as well.
        auto const value = (i == 0 || bools(gen)) ? reals(gen) : values[i - 1];
        digest.add(value);
        values.push_back(value);
    }
    // The reference helper indexes `values` positionally, so sort first.
    std::sort(values.begin(), values.end());
    digest.compress();

    EXPECT_EQ(digest.processed().size(), values.size());

    // Probe both end points, values just inside them, the median and one
    // random quantile.
    std::vector<double> testValues {0.0, 1.0e-10, qvalue(gen), 0.5, 1.0 - 1e-10, 1.0};
    for (auto q : testValues) {
        double q1 = quantile(q, values);
        auto q2 = digest.quantile(q);
        if (std::isnan(q1)) {
            EXPECT_TRUE(std::isnan(q2));
        } else {
            EXPECT_NEAR(q1, q2, 0.03) << "q = " << q;
        }
    }
}

// Feeds the digest a total weight that exceeds the int32 range and checks
// that the total is reported as expected and that quantiles stay
// non-decreasing.
TEST_F(TDigestTest, MoreThan2BValues) {
    TDigest digest(1000);

    std::random_device gen;
    std::uniform_real_distribution<> reals(0.0, 1.0);
    // 1000 samples of unit weight ...
    for (int i = 0; i < 1000; ++i) {
        const double next = reals(gen);
        digest.add(next);
    }
    // ... followed by 10 samples of weight 2^28 each.
    for (int i = 0; i < 10; ++i) {
        const double next = reals(gen);
        const auto count = 1L << 28;
        digest.add(next, count);
    }
    // NOTE(review): the expected total is routed through `float` on purpose in
    // the original test, presumably mirroring the precision TDigest uses when
    // accumulating weights -- confirm against the weight type in tdigest.h.
    EXPECT_EQ(static_cast<long>(1000 + static_cast<float>(10L * (1 << 28))), digest.totalWeight());
    EXPECT_GT(digest.totalWeight(), std::numeric_limits<int32_t>::max());

    std::vector<double> quantiles {0, 0.1, 0.5, 0.9, 1, reals(gen)};
    std::sort(quantiles.begin(), quantiles.end());
    // Seed with the most negative finite double. numeric_limits<double>::min()
    // would be the smallest *positive* normal value and would wrongly reject a
    // first quantile of exactly 0.0.
    auto prev = std::numeric_limits<double>::lowest();
    for (double q : quantiles) {
        const double v = digest.quantile(q);
        EXPECT_GE(v, prev) << "q = " << q;
        prev = v;
    }
}

// Smoke test: handing a digest that never received any sample to another
// such digest through the vector overload of add() must simply run to
// completion. Nothing is asserted.
TEST_F(TDigestTest, MergeTest) {
    TDigest source(1000);
    TDigest target(1000);

    target.add(std::vector<const TDigest*> {&source});
}

// After compress(), the centroids returned by processed() must be ordered by
// non-decreasing mean.
TEST_F(TDigestTest, TestSorted) {
    TDigest digest(1000);
    std::uniform_real_distribution<> reals(0.0, 1.0);
    std::uniform_int_distribution<> ints(0, 10);

    std::random_device gen;
    // 10000 samples in [0, 1), each with a random integer weight in [1, 11].
    for (int i = 0; i < 10000; ++i) {
        digest.add(reals(gen), 1 + ints(gen));
    }
    digest.compress();

    // A zero-weight centroid serves as the "no predecessor yet" sentinel, so
    // the comparison is skipped on the first iteration.
    Centroid previous(0, 0);
    for (const auto& centroid : digest.processed()) {
        if (previous.weight() != 0) {
            // EXPECT_LE reports an ordering violation as an ordinary test
            // failure, consistent with the other tests in this file.
            EXPECT_LE(previous.mean(), centroid.mean());
        }
        previous = centroid;
    }
}

// Checks interpolation between and around three weighted centroids against
// the reference quantile() helper.
TEST_F(TDigestTest, ExtremeQuantiles) {
    // t-digest shouldn't merge extreme nodes, but let's still test how it would
    // answer extreme quantiles in that case ('extreme' in the sense that the
    // quantile is either before the first node or after the last one).
    TDigest digest(1000);
    digest.add(10, 3);
    digest.add(20, 1);
    digest.add(40, 5);

    // Three centroids with weights 3, 1 and 5 are roughly equivalent to the
    // following sorted array of nine samples:
    //   [ ?, 10,  ?, 20,  ?,  ?, 40,  ?,  ? ]
    // and the digest is expected to fill in the missing values as:
    //   [ 5, 10, 15, 20, 30, 35, 40, 45, 50 ]
    const std::vector<double> expected {5.0, 10.0, 15.0, 20.0, 30.0, 35.0, 40.0, 45.0, 50.0};
    const std::vector<double> probes {1.5 / 9.0, 3.5 / 9.0, 6.5 / 9.0};
    for (const double q : probes) {
        EXPECT_NEAR(quantile(q, expected), digest.quantile(q), 0.01) << "q = " << q;
    }
}

// Sweeps z over [0, 1] and checks that both quantile(z) and cdf(z) never
// decrease from one step to the next.
TEST_F(TDigestTest, Montonicity) {
    TDigest digest(1000);
    std::random_device entropy;
    std::uniform_real_distribution<> unit(0.0, 1.0);
    for (int added = 0; added < LOOP_LESS_OR_MORE(10, 100000); ++added) {
        digest.add(unit(entropy));
    }

    // Both trackers start at -1, the seed value for the first comparison.
    double prev_quantile = -1;
    double prev_cdf = -1;
    double z = 0;
    while (z <= 1) {
        const double current_quantile = digest.quantile(z);
        EXPECT_GE(current_quantile, prev_quantile);
        prev_quantile = current_quantile;

        const double current_cdf = digest.cdf(z);
        EXPECT_GE(current_cdf, prev_cdf);
        prev_cdf = current_cdf;

        z += LOOP_LESS_OR_MORE(0.1, 1e-5);
    }
}

} // namespace doris