320
be/src/util/radix_sort.h
Executable file
320
be/src/util/radix_sort.h
Executable file
@ -0,0 +1,320 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
/*
|
||||
* This implementation of RadixSort is copied from ClickHouse.
|
||||
* We only reserve some functions which is useful to us and solve some c++11 incompatibility problem.
|
||||
* We can use this implementation to sort float, double, int, uint and other complex object.
|
||||
* See original code: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/RadixSort.h
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef RADIXSORT_H_
|
||||
#define RADIXSORT_H_
|
||||
|
||||
#include <string.h>
|
||||
#include <malloc.h>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstdint>
|
||||
#include <type_traits>
|
||||
#include "common/compiler_util.h"
|
||||
|
||||
namespace doris {
|
||||
|
||||
template<typename T >
|
||||
using decay_t = typename std::decay<T>::type;
|
||||
|
||||
template<bool cond, typename T, typename F>
|
||||
using conditional_t = typename std::conditional<cond, T, F>::type;
|
||||
|
||||
template<typename T>
|
||||
using make_unsigned_t = typename std::make_unsigned<T>::type;
|
||||
|
||||
template<typename T>
|
||||
using is_integral_v = typename std::is_integral<T>::value;
|
||||
|
||||
template<typename T>
|
||||
using is_unsigned_v = typename std::is_unsigned<T>::value;
|
||||
|
||||
template <typename To, typename From>
|
||||
decay_t<To> bit_cast(const From& from) {
|
||||
To res {};
|
||||
memcpy(static_cast<void*>(&res), &from, std::min(sizeof(res), sizeof(from)));
|
||||
return res;
|
||||
}
|
||||
|
||||
/** Radix sort, has the following functionality:
|
||||
* Can sort unsigned, signed numbers, and floats.
|
||||
* Can sort an array of fixed length elements that contain something else besides the key.
|
||||
* Customizable radix size.
|
||||
*
|
||||
* LSB, stable.
|
||||
* NOTE For some applications it makes sense to add MSB-radix-sort,
|
||||
* as well as radix-select, radix-partial-sort, radix-get-permutation algorithms based on it.
|
||||
*/
|
||||
|
||||
|
||||
/** Used as a template parameter. See below.
|
||||
*/
|
||||
struct RadixSortMallocAllocator {
|
||||
void * allocate(size_t size) {
|
||||
return malloc(size);
|
||||
}
|
||||
|
||||
void deallocate(void * ptr, size_t /*size*/) {
|
||||
return free(ptr);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/** A transformation that transforms the bit representation of a key into an unsigned integer number,
|
||||
* that the order relation over the keys will match the order relation over the obtained unsigned numbers.
|
||||
* For floats this conversion does the following:
|
||||
* if the signed bit is set, it flips all other bits.
|
||||
* In this case, NaN-s are bigger than all normal numbers.
|
||||
*/
|
||||
template <typename KeyBits>
|
||||
struct RadixSortFloatTransform {
|
||||
/// Is it worth writing the result in memory, or is it better to do calculation every time again?
|
||||
static constexpr bool transform_is_simple = false;
|
||||
|
||||
static KeyBits forward(KeyBits x) {
|
||||
return x ^ ((-(x >> (sizeof(KeyBits) * 8 - 1))) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)));
|
||||
}
|
||||
|
||||
static KeyBits backward(KeyBits x) {
|
||||
return x ^ (((x >> (sizeof(KeyBits) * 8 - 1)) - 1) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename TElement>
|
||||
struct RadixSortFloatTraits {
|
||||
using Element = TElement; /// The type of the element. It can be a structure with a key and some other payload. Or just a key.
|
||||
using Key = Element; /// The key to sort by.
|
||||
|
||||
/// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t.
|
||||
using CountType = uint32_t;
|
||||
|
||||
/// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
|
||||
using KeyBits = conditional_t<sizeof(Key) == 8, uint64_t, uint32_t>;
|
||||
|
||||
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
|
||||
|
||||
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
|
||||
using Transform = RadixSortFloatTransform<KeyBits>;
|
||||
|
||||
/// An object with the functions allocate and deallocate.
|
||||
/// Can be used, for example, to allocate memory for a temporary array on the stack.
|
||||
/// To do this, the allocator itself is created on the stack.
|
||||
using Allocator = RadixSortMallocAllocator;
|
||||
|
||||
/// The function to get the key from an array element.
|
||||
static Key & extractKey(Element & elem) { return elem; }
|
||||
|
||||
/// Used when fallback to comparison based sorting is needed.
|
||||
/// TODO: Correct handling of NaNs, NULLs, etc
|
||||
static bool less(Key x, Key y) {
|
||||
return x < y;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename KeyBits>
|
||||
struct RadixSortIdentityTransform {
|
||||
static constexpr bool transform_is_simple = true;
|
||||
|
||||
static KeyBits forward(KeyBits x) { return x; }
|
||||
static KeyBits backward(KeyBits x) { return x; }
|
||||
};
|
||||
|
||||
|
||||
template <typename TElement>
|
||||
struct RadixSortUIntTraits {
|
||||
using Element = TElement;
|
||||
using Key = Element;
|
||||
using CountType = uint32_t;
|
||||
using KeyBits = Key;
|
||||
|
||||
static constexpr size_t PART_SIZE_BITS = 8;
|
||||
|
||||
using Transform = RadixSortIdentityTransform<KeyBits>;
|
||||
using Allocator = RadixSortMallocAllocator;
|
||||
|
||||
static Key & extractKey(Element & elem) { return elem; }
|
||||
|
||||
static bool less(Key x, Key y) {
|
||||
return x < y;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename KeyBits>
|
||||
struct RadixSortSignedTransform
|
||||
{
|
||||
static constexpr bool transform_is_simple = true;
|
||||
|
||||
static KeyBits forward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
|
||||
static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
|
||||
};
|
||||
|
||||
|
||||
template <typename TElement>
|
||||
struct RadixSortIntTraits {
|
||||
using Element = TElement;
|
||||
using Key = Element;
|
||||
using CountType = uint32_t;
|
||||
using KeyBits = make_unsigned_t<Key>;
|
||||
|
||||
static constexpr size_t PART_SIZE_BITS = 8;
|
||||
|
||||
using Transform = RadixSortSignedTransform<KeyBits>;
|
||||
using Allocator = RadixSortMallocAllocator;
|
||||
|
||||
static Key & extractKey(Element & elem) { return elem; }
|
||||
|
||||
static bool less(Key x, Key y) {
|
||||
return x < y;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
template <typename T>
|
||||
using RadixSortNumTraits =
|
||||
conditional_t<std::is_integral<T>::value,
|
||||
conditional_t<std::is_unsigned<T>::value,
|
||||
RadixSortUIntTraits<T>,
|
||||
RadixSortIntTraits<T>>,
|
||||
RadixSortFloatTraits<T>>;
|
||||
|
||||
/*
|
||||
* To use RadixSort, you should define `Traits` to give out the information for sorting.
|
||||
* `RadixSortFloatTraits` is a good example to refer to.
|
||||
* Then you can run it as follows:
|
||||
* RadixSort<YourTraits>::executeLSD(arr, size);
|
||||
*
|
||||
* In particular, if you want to sort an array of numeric, you can use it easily as follows:
|
||||
* radixSortLSD(array_of_numeric, array_size);
|
||||
*
|
||||
* See more use cases: be/test/util/radix_sort_test.cpp
|
||||
*
|
||||
*/
|
||||
template <typename Traits>
|
||||
struct RadixSort {
|
||||
private:
|
||||
using Element = typename Traits::Element;
|
||||
using Key = typename Traits::Key;
|
||||
using CountType = typename Traits::CountType;
|
||||
using KeyBits = typename Traits::KeyBits;
|
||||
|
||||
// Use insertion sort if the size of the array is less than equal to this threshold
|
||||
static constexpr size_t INSERTION_SORT_THRESHOLD = 64;
|
||||
|
||||
static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS;
|
||||
static constexpr size_t PART_BITMASK = HISTOGRAM_SIZE - 1;
|
||||
static constexpr size_t KEY_BITS = sizeof(Key) * 8;
|
||||
static constexpr size_t NUM_PASSES = (KEY_BITS + (Traits::PART_SIZE_BITS - 1)) / Traits::PART_SIZE_BITS;
|
||||
|
||||
static ALWAYS_INLINE KeyBits getPart(size_t N, KeyBits x) {
|
||||
if (Traits::Transform::transform_is_simple)
|
||||
x = Traits::Transform::forward(x);
|
||||
|
||||
return (x >> (N * Traits::PART_SIZE_BITS)) & PART_BITMASK;
|
||||
}
|
||||
|
||||
static KeyBits keyToBits(Key x) { return bit_cast<KeyBits>(x); }
|
||||
static Key bitsToKey(KeyBits x) { return bit_cast<Key>(x); }
|
||||
|
||||
public:
|
||||
/// Least significant digit radix sort (stable)
|
||||
static void executeLSD(Element * arr, size_t size) {
|
||||
/// If the array is smaller than 256, then it is better to use another algorithm.
|
||||
|
||||
/// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time.
|
||||
|
||||
/// For each of the NUM_PASSES bit ranges of the key, consider how many times each value of this bit range met.
|
||||
CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0};
|
||||
|
||||
typename Traits::Allocator allocator;
|
||||
|
||||
/// We will do several passes through the array. On each pass, the data is transferred to another array. Let's allocate this temporary array.
|
||||
Element * swap_buffer = reinterpret_cast<Element *>(allocator.allocate(size * sizeof(Element)));
|
||||
|
||||
/// Transform the array and calculate the histogram.
|
||||
/// NOTE This is slightly suboptimal. Look at https://github.com/powturbo/TurboHist
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
if (!Traits::Transform::transform_is_simple)
|
||||
Traits::extractKey(arr[i]) = bitsToKey(Traits::Transform::forward(keyToBits(Traits::extractKey(arr[i]))));
|
||||
|
||||
for (size_t pass = 0; pass < NUM_PASSES; ++pass)
|
||||
++histograms[pass * HISTOGRAM_SIZE + getPart(pass, keyToBits(Traits::extractKey(arr[i])))];
|
||||
}
|
||||
|
||||
{
|
||||
/// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one.
|
||||
size_t sums[NUM_PASSES] = {0};
|
||||
|
||||
for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) {
|
||||
for (size_t pass = 0; pass < NUM_PASSES; ++pass) {
|
||||
size_t tmp = histograms[pass * HISTOGRAM_SIZE + i] + sums[pass];
|
||||
histograms[pass * HISTOGRAM_SIZE + i] = sums[pass] - 1;
|
||||
sums[pass] = tmp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces.
|
||||
for (size_t pass = 0; pass < NUM_PASSES; ++pass) {
|
||||
Element * writer = pass % 2 ? arr : swap_buffer;
|
||||
Element * reader = pass % 2 ? swap_buffer : arr;
|
||||
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));
|
||||
|
||||
/// Place the element on the next free position.
|
||||
auto & dest = writer[++histograms[pass * HISTOGRAM_SIZE + pos]];
|
||||
dest = reader[i];
|
||||
|
||||
/// On the last pass, we do the reverse transformation.
|
||||
if (!Traits::Transform::transform_is_simple && pass == NUM_PASSES - 1)
|
||||
Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i]))));
|
||||
}
|
||||
}
|
||||
|
||||
/// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
|
||||
/// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array.
|
||||
if (NUM_PASSES % 2)
|
||||
memcpy(arr, swap_buffer, size * sizeof(Element));
|
||||
|
||||
allocator.deallocate(swap_buffer, size * sizeof(Element));
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Helper functions for numeric types.
|
||||
/// Use RadixSort with custom traits for complex types instead.
|
||||
|
||||
template <typename T>
|
||||
void radixSortLSD(T *arr, size_t size) {
|
||||
RadixSort<RadixSortNumTraits<T>>::executeLSD(arr, size);
|
||||
}
|
||||
|
||||
} // namespace doris
|
||||
|
||||
#endif // RADIXSORT_H_
|
||||
@ -33,10 +33,10 @@
|
||||
*/
|
||||
|
||||
// T-Digest : Percentile and Quantile Estimation of Big Data
|
||||
// A new data structure for accurate on-line accumulation of rank-based statistics
|
||||
// A new data structure for accurate on-line accumulation of rank-based statistics
|
||||
// such as quantiles and trimmed means.
|
||||
// See original paper: "Computing extremely accurate quantiles using t-digest"
|
||||
// by Ted Dunning and Otmar Ertl for more details
|
||||
// See original paper: "Computing extremely accurate quantiles using t-digest"
|
||||
// by Ted Dunning and Otmar Ertl for more details
|
||||
// https://github.com/tdunning/t-digest/blob/07b8f2ca2be8d0a9f04df2feadad5ddc1bb73c88/docs/t-digest-paper/histo.pdf.
|
||||
// https://github.com/derrickburns/tdigest
|
||||
|
||||
@ -54,12 +54,13 @@
|
||||
|
||||
#include "common/logging.h"
|
||||
#include "util/debug_util.h"
|
||||
#include "util/radix_sort.h"
|
||||
#include "udf/udf.h"
|
||||
|
||||
namespace doris {
|
||||
|
||||
using Value = double;
|
||||
using Weight = double;
|
||||
using Value = float;
|
||||
using Weight = float;
|
||||
using Index = size_t;
|
||||
|
||||
const size_t kHighWater = 40000;
|
||||
@ -74,6 +75,10 @@ public:
|
||||
|
||||
inline Weight weight() const noexcept { return _weight; }
|
||||
|
||||
inline Value& mean() noexcept { return _mean; }
|
||||
|
||||
inline Weight& weight() noexcept { return _weight; }
|
||||
|
||||
inline void add(const Centroid &c) {
|
||||
DCHECK_GT(c._weight, 0);
|
||||
if (_weight != 0.0) {
|
||||
@ -115,6 +120,22 @@ struct CentroidComparator {
|
||||
|
||||
class TDigest {
|
||||
|
||||
struct TDigestRadixSortTraits
|
||||
{
|
||||
using Element = Centroid;
|
||||
using Key = Value;
|
||||
using CountType = uint32_t;
|
||||
using KeyBits = uint32_t;
|
||||
|
||||
static constexpr size_t PART_SIZE_BITS = 8;
|
||||
|
||||
using Transform = RadixSortFloatTransform<KeyBits>;
|
||||
using Allocator = RadixSortMallocAllocator;
|
||||
|
||||
static Key & extractKey(Element& elem) { return elem.mean(); }
|
||||
|
||||
};
|
||||
|
||||
class TDigestComparator {
|
||||
public:
|
||||
TDigestComparator() {}
|
||||
@ -415,7 +436,7 @@ public:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
uint32_t serialized_size() {
|
||||
return sizeof(Value) * 5 + sizeof(Index) * 2 + sizeof(size_t) * 3
|
||||
+ _processed.size() * sizeof(Centroid)
|
||||
@ -612,7 +633,7 @@ private:
|
||||
// when complete, _unprocessed will be empty and _processed will have at most _max_processed centroids
|
||||
inline void process() {
|
||||
CentroidComparator cc;
|
||||
std::sort(_unprocessed.begin(), _unprocessed.end(), cc);
|
||||
RadixSort<TDigestRadixSortTraits>::executeLSD(_unprocessed.data(), _unprocessed.size());
|
||||
auto count = _unprocessed.size();
|
||||
_unprocessed.insert(_unprocessed.end(), _processed.cbegin(), _processed.cend());
|
||||
std::inplace_merge(_unprocessed.begin(), _unprocessed.begin() + count, _unprocessed.end(), cc);
|
||||
|
||||
@ -109,8 +109,7 @@ TEST_F(PercentileApproxTest, testNullVale) {
|
||||
AggregateFunctions::percentile_approx_init(context, &stringVal2);
|
||||
AggregateFunctions::percentile_approx_merge(context, serialized, &stringVal2);
|
||||
DoubleVal v = AggregateFunctions::percentile_approx_finalize(context, stringVal2);
|
||||
//ASSERT_EQ(v.val, 99900.5);
|
||||
ASSERT_DOUBLE_EQ(v.val, 99900.665999999997);
|
||||
ASSERT_FLOAT_EQ(v.val, 99900.665999999997);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -50,3 +50,4 @@ ADD_BE_TEST(arrow/arrow_work_flow_test)
|
||||
ADD_BE_TEST(counter_cond_variable_test)
|
||||
ADD_BE_TEST(frame_of_reference_coding_test)
|
||||
ADD_BE_TEST(bit_stream_utils_test)
|
||||
ADD_BE_TEST(radix_sort_test)
|
||||
|
||||
238
be/test/util/radix_sort_test.cpp
Normal file
238
be/test/util/radix_sort_test.cpp
Normal file
@ -0,0 +1,238 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include <random>
|
||||
#include <gtest/gtest.h>
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <iostream>
|
||||
#include <cstdlib>
|
||||
|
||||
#include "util/tdigest.h"
|
||||
#include "util/radix_sort.h"
|
||||
|
||||
namespace doris {
|
||||
|
||||
class RadixSortTest : public ::testing::Test {
|
||||
protected:
|
||||
// You can remove any or all of the following functions if its body
|
||||
// is empty.
|
||||
RadixSortTest() {
|
||||
// You can do set-up work for each test here.
|
||||
}
|
||||
|
||||
virtual ~RadixSortTest() {
|
||||
// You can do clean-up work that doesn't throw exceptions here.
|
||||
}
|
||||
|
||||
// If the constructor and destructor are not enough for setting up
|
||||
// and cleaning up each test, you can define the following methods:
|
||||
|
||||
virtual void SetUp() {
|
||||
// Code here will be called immediately after the constructor (right
|
||||
// before each test).
|
||||
}
|
||||
|
||||
virtual void TearDown() {
|
||||
// Code here will be called immediately after each test (right
|
||||
// before the destructor).
|
||||
}
|
||||
|
||||
static void SetUpTestCase() {
|
||||
static bool initialized = false;
|
||||
if (!initialized) {
|
||||
FLAGS_logtostderr = true;
|
||||
google::InstallFailureSignalHandler();
|
||||
google::InitGoogleLogging("testing::RadixSortTest");
|
||||
initialized = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Objects declared here can be used by all tests in the test case for Foo.
|
||||
};
|
||||
|
||||
TEST_F(RadixSortTest, TestUint32Sort) {
|
||||
constexpr size_t num_values = 10000;
|
||||
std::vector<uint32_t> data;
|
||||
// generating random data
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
data.push_back(num_values - i);
|
||||
}
|
||||
std::random_device rd;
|
||||
std::mt19937 g(rd());
|
||||
std::shuffle(data.begin(), data.end(), g);
|
||||
radixSortLSD(data.data(), data.size());
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
data[i] = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(RadixSortTest, TestInt32Sort) {
|
||||
constexpr size_t num_values = 10000;
|
||||
std::vector<int32_t> data;
|
||||
// generating random data
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
data.push_back(num_values - i - 5000);
|
||||
}
|
||||
std::random_device rd;
|
||||
std::mt19937 g(rd());
|
||||
std::shuffle(data.begin(), data.end(), g);
|
||||
radixSortLSD(data.data(), data.size());
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
data[i] = i + 1 - 5000;
|
||||
}
|
||||
}
|
||||
|
||||
bool compare_float_with_epsilon(float a, float b, float E) {
|
||||
return std::abs(a - b) < E;
|
||||
}
|
||||
|
||||
TEST_F(RadixSortTest, TestFloatSort) {
|
||||
constexpr size_t num_values = 10000;
|
||||
std::vector<float> data;
|
||||
// generating random data
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
data.push_back(1.0 * num_values - i - 5000 + 0.1);
|
||||
}
|
||||
float nan = std::numeric_limits<float>::quiet_NaN();
|
||||
float max = std::numeric_limits<float>::max();
|
||||
float min = std::numeric_limits<float>::lowest();
|
||||
float infinity = std::numeric_limits<float>::infinity();
|
||||
data.push_back(nan);
|
||||
data.push_back(max);
|
||||
data.push_back(min);
|
||||
data.push_back(infinity);
|
||||
std::random_device rd;
|
||||
std::mt19937 g(rd());
|
||||
std::shuffle(data.begin(), data.end(), g);
|
||||
radixSortLSD(data.data(), data.size());
|
||||
for (size_t i = 0; i < num_values + 4; ++i) {
|
||||
if (i == 0) {
|
||||
ASSERT_TRUE(compare_float_with_epsilon(data[i], min, 0.0000001));
|
||||
} else if (i == num_values + 1) {
|
||||
ASSERT_TRUE(compare_float_with_epsilon(data[i], max, 0.0000001));
|
||||
} else if (i == num_values + 2) {
|
||||
ASSERT_TRUE(std::isinf(data[i]));
|
||||
} else if (i == num_values + 3) {
|
||||
ASSERT_TRUE(std::isnan(data[i]));
|
||||
} else {
|
||||
ASSERT_TRUE(compare_float_with_epsilon(data[i], 1.0 * i - 5000 + 0.1, 0.0000001));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool compare_double_with_epsilon(double a, double b, double E) {
|
||||
return std::abs(a - b) < E;
|
||||
}
|
||||
|
||||
TEST_F(RadixSortTest, TestDoubleSort) {
|
||||
constexpr size_t num_values = 10000;
|
||||
std::vector<double> data;
|
||||
// generating random data
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
data.push_back(num_values * 1.0 - i - 5000 + 0.1);
|
||||
}
|
||||
double nan = std::numeric_limits<double>::quiet_NaN();
|
||||
double max = std::numeric_limits<double>::max();
|
||||
double min = std::numeric_limits<double>::lowest();
|
||||
double infinity = std::numeric_limits<double>::infinity();
|
||||
data.push_back(nan);
|
||||
data.push_back(max);
|
||||
data.push_back(min);
|
||||
data.push_back(infinity);
|
||||
std::random_device rd;
|
||||
std::mt19937 g(rd());
|
||||
std::shuffle(data.begin(), data.end(), g);
|
||||
radixSortLSD(data.data(), data.size());
|
||||
for (size_t i = 0; i < num_values + 4; ++i) {
|
||||
if (i == 0) {
|
||||
ASSERT_TRUE(compare_double_with_epsilon(data[i], min, 0.0000001));
|
||||
} else if (i == num_values + 1) {
|
||||
ASSERT_TRUE(compare_double_with_epsilon(data[i], max, 0.0000001));
|
||||
} else if (i == num_values + 2) {
|
||||
ASSERT_TRUE(std::isinf(data[i]));
|
||||
} else if (i == num_values + 3) {
|
||||
ASSERT_TRUE(std::isnan(data[i]));
|
||||
} else {
|
||||
double tmp = 1.0 * i - 5000 + 0.1;
|
||||
ASSERT_TRUE(compare_double_with_epsilon(data[i], tmp, 0.0000001));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct TestObject {
|
||||
float d1;
|
||||
float d2;
|
||||
};
|
||||
|
||||
struct RadixSortTestTraits {
|
||||
using Element = TestObject;
|
||||
using Key = float;
|
||||
using CountType = uint32_t;
|
||||
using KeyBits = uint32_t;
|
||||
|
||||
static constexpr size_t PART_SIZE_BITS = 8;
|
||||
|
||||
using Transform = RadixSortFloatTransform<KeyBits>;
|
||||
using Allocator = RadixSortMallocAllocator;
|
||||
|
||||
static Key & extractKey(Element& elem) { return elem.d1; }
|
||||
};
|
||||
|
||||
TEST_F(RadixSortTest, TestObjectSort) {
|
||||
constexpr size_t num_values = 10000;
|
||||
std::vector<TestObject> data;
|
||||
data.resize(10004);
|
||||
// generating random data
|
||||
for (size_t i = 0; i < num_values; ++i) {
|
||||
data[i].d1 = 1.0 * num_values - i - 5000 + 0.1;
|
||||
}
|
||||
float nan = std::numeric_limits<float>::quiet_NaN();
|
||||
float max = std::numeric_limits<float>::max();
|
||||
float min = std::numeric_limits<float>::lowest();
|
||||
float infinity = std::numeric_limits<float>::infinity();
|
||||
data[num_values].d1 = nan;
|
||||
data[num_values + 1].d1 = max;
|
||||
data[num_values + 2].d1 = min;
|
||||
data[num_values + 3].d1 = infinity;
|
||||
std::random_device rd;
|
||||
std::mt19937 g(rd());
|
||||
std::shuffle(data.begin(), data.end(), g);
|
||||
RadixSort<RadixSortTestTraits>::executeLSD(data.data(), data.size());
|
||||
for (size_t i = 0; i < num_values + 4; ++i) {
|
||||
if (i == 0) {
|
||||
ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, min, 0.0000001));
|
||||
} else if (i == num_values + 1) {
|
||||
ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, max, 0.0000001));
|
||||
} else if (i == num_values + 2) {
|
||||
ASSERT_TRUE(std::isinf(data[i].d1));
|
||||
} else if (i == num_values + 3) {
|
||||
ASSERT_TRUE(std::isnan(data[i].d1));
|
||||
} else {
|
||||
float tmp = 1.0 * i - 5000 + 0.1;
|
||||
ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, tmp, 0.0000001));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} // namespace stesting
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
::testing::InitGoogleTest(&argc, argv);
|
||||
return RUN_ALL_TESTS();
|
||||
}
|
||||
@ -162,7 +162,7 @@ TEST_F(TDigestTest, MoreThan2BValues) {
|
||||
const auto count = 1L << 28;
|
||||
digest.add(next, count);
|
||||
}
|
||||
EXPECT_EQ(1000 + 10L * (1 << 28), digest.totalWeight());
|
||||
EXPECT_EQ(static_cast<long>(1000 + float(10L * (1 << 28))), digest.totalWeight());
|
||||
EXPECT_GT(digest.totalWeight(), std::numeric_limits<int32_t>::max());
|
||||
std::vector<double> quantiles{0, 0.1, 0.5, 0.9, 1, reals(gen)};
|
||||
std::sort(quantiles.begin(), quantiles.end());
|
||||
|
||||
@ -154,6 +154,7 @@ ${DORIS_TEST_BINARY_DIR}/util/string_util_test
|
||||
${DORIS_TEST_BINARY_DIR}/util/coding_test
|
||||
${DORIS_TEST_BINARY_DIR}/util/faststring_test
|
||||
${DORIS_TEST_BINARY_DIR}/util/tdigest_test
|
||||
${DORIS_TEST_BINARY_DIR}/util/radix_sort_test
|
||||
${DORIS_TEST_BINARY_DIR}/util/block_compression_test
|
||||
${DORIS_TEST_BINARY_DIR}/util/arrow/arrow_row_block_test
|
||||
${DORIS_TEST_BINARY_DIR}/util/arrow/arrow_row_batch_test
|
||||
|
||||
Reference in New Issue
Block a user