Optimize percentile_approx through radix sort (#2102) (#2107)

This commit is contained in:
shengyunyao
2019-11-05 09:25:47 +08:00
committed by ZHAO Chun
parent d2e34310ef
commit ccc1b9d98c
7 changed files with 590 additions and 10 deletions

320
be/src/util/radix_sort.h Executable file
View File

@ -0,0 +1,320 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
/*
* This implementation of RadixSort is copied from ClickHouse.
* We only reserve some functions which is useful to us and solve some c++11 incompatibility problem.
* We can use this implementation to sort float, double, int, uint and other complex object.
* See original code: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/RadixSort.h
*
*/
#ifndef RADIXSORT_H_
#define RADIXSORT_H_
#include <string.h>
#include <malloc.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <cstdint>
#include <type_traits>
#include "common/compiler_util.h"
namespace doris {
template<typename T >
using decay_t = typename std::decay<T>::type;
template<bool cond, typename T, typename F>
using conditional_t = typename std::conditional<cond, T, F>::type;
template<typename T>
using make_unsigned_t = typename std::make_unsigned<T>::type;
template<typename T>
using is_integral_v = typename std::is_integral<T>::value;
template<typename T>
using is_unsigned_v = typename std::is_unsigned<T>::value;
template <typename To, typename From>
decay_t<To> bit_cast(const From& from) {
To res {};
memcpy(static_cast<void*>(&res), &from, std::min(sizeof(res), sizeof(from)));
return res;
}
/** Radix sort, has the following functionality:
* Can sort unsigned, signed numbers, and floats.
* Can sort an array of fixed length elements that contain something else besides the key.
* Customizable radix size.
*
* LSB, stable.
* NOTE For some applications it makes sense to add MSB-radix-sort,
* as well as radix-select, radix-partial-sort, radix-get-permutation algorithms based on it.
*/
/** Used as a template parameter. See below.
*/
struct RadixSortMallocAllocator {
void * allocate(size_t size) {
return malloc(size);
}
void deallocate(void * ptr, size_t /*size*/) {
return free(ptr);
}
};
/** A transformation that transforms the bit representation of a key into an unsigned integer number,
* that the order relation over the keys will match the order relation over the obtained unsigned numbers.
* For floats this conversion does the following:
* if the signed bit is set, it flips all other bits.
* In this case, NaN-s are bigger than all normal numbers.
*/
template <typename KeyBits>
struct RadixSortFloatTransform {
/// Is it worth writing the result in memory, or is it better to do calculation every time again?
static constexpr bool transform_is_simple = false;
static KeyBits forward(KeyBits x) {
return x ^ ((-(x >> (sizeof(KeyBits) * 8 - 1))) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)));
}
static KeyBits backward(KeyBits x) {
return x ^ (((x >> (sizeof(KeyBits) * 8 - 1)) - 1) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)));
}
};
template <typename TElement>
struct RadixSortFloatTraits {
using Element = TElement; /// The type of the element. It can be a structure with a key and some other payload. Or just a key.
using Key = Element; /// The key to sort by.
/// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t.
using CountType = uint32_t;
/// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
using KeyBits = conditional_t<sizeof(Key) == 8, uint64_t, uint32_t>;
static constexpr size_t PART_SIZE_BITS = 8; /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
/// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
using Transform = RadixSortFloatTransform<KeyBits>;
/// An object with the functions allocate and deallocate.
/// Can be used, for example, to allocate memory for a temporary array on the stack.
/// To do this, the allocator itself is created on the stack.
using Allocator = RadixSortMallocAllocator;
/// The function to get the key from an array element.
static Key & extractKey(Element & elem) { return elem; }
/// Used when fallback to comparison based sorting is needed.
/// TODO: Correct handling of NaNs, NULLs, etc
static bool less(Key x, Key y) {
return x < y;
}
};
template <typename KeyBits>
struct RadixSortIdentityTransform {
static constexpr bool transform_is_simple = true;
static KeyBits forward(KeyBits x) { return x; }
static KeyBits backward(KeyBits x) { return x; }
};
template <typename TElement>
struct RadixSortUIntTraits {
using Element = TElement;
using Key = Element;
using CountType = uint32_t;
using KeyBits = Key;
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortIdentityTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
static Key & extractKey(Element & elem) { return elem; }
static bool less(Key x, Key y) {
return x < y;
}
};
template <typename KeyBits>
struct RadixSortSignedTransform
{
static constexpr bool transform_is_simple = true;
static KeyBits forward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
static KeyBits backward(KeyBits x) { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
};
template <typename TElement>
struct RadixSortIntTraits {
using Element = TElement;
using Key = Element;
using CountType = uint32_t;
using KeyBits = make_unsigned_t<Key>;
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortSignedTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
static Key & extractKey(Element & elem) { return elem; }
static bool less(Key x, Key y) {
return x < y;
}
};
template <typename T>
using RadixSortNumTraits =
conditional_t<std::is_integral<T>::value,
conditional_t<std::is_unsigned<T>::value,
RadixSortUIntTraits<T>,
RadixSortIntTraits<T>>,
RadixSortFloatTraits<T>>;
/*
* To use RadixSort, you should define `Traits` to give out the information for sorting.
* `RadixSortFloatTraits` is a good example to refer to.
* Then you can run it as follows:
* RadixSort<YourTraits>::executeLSD(arr, size);
*
* In particular, if you want to sort an array of numeric, you can use it easily as follows:
* radixSortLSD(array_of_numeric, array_size);
*
* See more use cases: be/test/util/radix_sort_test.cpp
*
*/
template <typename Traits>
struct RadixSort {
private:
using Element = typename Traits::Element;
using Key = typename Traits::Key;
using CountType = typename Traits::CountType;
using KeyBits = typename Traits::KeyBits;
// Use insertion sort if the size of the array is less than equal to this threshold
static constexpr size_t INSERTION_SORT_THRESHOLD = 64;
static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS;
static constexpr size_t PART_BITMASK = HISTOGRAM_SIZE - 1;
static constexpr size_t KEY_BITS = sizeof(Key) * 8;
static constexpr size_t NUM_PASSES = (KEY_BITS + (Traits::PART_SIZE_BITS - 1)) / Traits::PART_SIZE_BITS;
static ALWAYS_INLINE KeyBits getPart(size_t N, KeyBits x) {
if (Traits::Transform::transform_is_simple)
x = Traits::Transform::forward(x);
return (x >> (N * Traits::PART_SIZE_BITS)) & PART_BITMASK;
}
static KeyBits keyToBits(Key x) { return bit_cast<KeyBits>(x); }
static Key bitsToKey(KeyBits x) { return bit_cast<Key>(x); }
public:
/// Least significant digit radix sort (stable)
static void executeLSD(Element * arr, size_t size) {
/// If the array is smaller than 256, then it is better to use another algorithm.
/// There are loops of NUM_PASSES. It is very important that they are unfolded at compile-time.
/// For each of the NUM_PASSES bit ranges of the key, consider how many times each value of this bit range met.
CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0};
typename Traits::Allocator allocator;
/// We will do several passes through the array. On each pass, the data is transferred to another array. Let's allocate this temporary array.
Element * swap_buffer = reinterpret_cast<Element *>(allocator.allocate(size * sizeof(Element)));
/// Transform the array and calculate the histogram.
/// NOTE This is slightly suboptimal. Look at https://github.com/powturbo/TurboHist
for (size_t i = 0; i < size; ++i) {
if (!Traits::Transform::transform_is_simple)
Traits::extractKey(arr[i]) = bitsToKey(Traits::Transform::forward(keyToBits(Traits::extractKey(arr[i]))));
for (size_t pass = 0; pass < NUM_PASSES; ++pass)
++histograms[pass * HISTOGRAM_SIZE + getPart(pass, keyToBits(Traits::extractKey(arr[i])))];
}
{
/// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one.
size_t sums[NUM_PASSES] = {0};
for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) {
for (size_t pass = 0; pass < NUM_PASSES; ++pass) {
size_t tmp = histograms[pass * HISTOGRAM_SIZE + i] + sums[pass];
histograms[pass * HISTOGRAM_SIZE + i] = sums[pass] - 1;
sums[pass] = tmp;
}
}
}
/// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces.
for (size_t pass = 0; pass < NUM_PASSES; ++pass) {
Element * writer = pass % 2 ? arr : swap_buffer;
Element * reader = pass % 2 ? swap_buffer : arr;
for (size_t i = 0; i < size; ++i) {
size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));
/// Place the element on the next free position.
auto & dest = writer[++histograms[pass * HISTOGRAM_SIZE + pos]];
dest = reader[i];
/// On the last pass, we do the reverse transformation.
if (!Traits::Transform::transform_is_simple && pass == NUM_PASSES - 1)
Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i]))));
}
}
/// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
/// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array.
if (NUM_PASSES % 2)
memcpy(arr, swap_buffer, size * sizeof(Element));
allocator.deallocate(swap_buffer, size * sizeof(Element));
}
};
/// Helper functions for numeric types.
/// Use RadixSort with custom traits for complex types instead.
template <typename T>
void radixSortLSD(T *arr, size_t size) {
RadixSort<RadixSortNumTraits<T>>::executeLSD(arr, size);
}
} // namespace doris
#endif // RADIXSORT_H_

View File

@ -33,10 +33,10 @@
*/
// T-Digest : Percentile and Quantile Estimation of Big Data
// A new data structure for accurate on-line accumulation of rank-based statistics
// A new data structure for accurate on-line accumulation of rank-based statistics
// such as quantiles and trimmed means.
// See original paper: "Computing extremely accurate quantiles using t-digest"
// by Ted Dunning and Otmar Ertl for more details
// See original paper: "Computing extremely accurate quantiles using t-digest"
// by Ted Dunning and Otmar Ertl for more details
// https://github.com/tdunning/t-digest/blob/07b8f2ca2be8d0a9f04df2feadad5ddc1bb73c88/docs/t-digest-paper/histo.pdf.
// https://github.com/derrickburns/tdigest
@ -54,12 +54,13 @@
#include "common/logging.h"
#include "util/debug_util.h"
#include "util/radix_sort.h"
#include "udf/udf.h"
namespace doris {
using Value = double;
using Weight = double;
using Value = float;
using Weight = float;
using Index = size_t;
const size_t kHighWater = 40000;
@ -74,6 +75,10 @@ public:
inline Weight weight() const noexcept { return _weight; }
inline Value& mean() noexcept { return _mean; }
inline Weight& weight() noexcept { return _weight; }
inline void add(const Centroid &c) {
DCHECK_GT(c._weight, 0);
if (_weight != 0.0) {
@ -115,6 +120,22 @@ struct CentroidComparator {
class TDigest {
struct TDigestRadixSortTraits
{
using Element = Centroid;
using Key = Value;
using CountType = uint32_t;
using KeyBits = uint32_t;
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortFloatTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
static Key & extractKey(Element& elem) { return elem.mean(); }
};
class TDigestComparator {
public:
TDigestComparator() {}
@ -415,7 +436,7 @@ public:
}
}
}
uint32_t serialized_size() {
return sizeof(Value) * 5 + sizeof(Index) * 2 + sizeof(size_t) * 3
+ _processed.size() * sizeof(Centroid)
@ -612,7 +633,7 @@ private:
// when complete, _unprocessed will be empty and _processed will have at most _max_processed centroids
inline void process() {
CentroidComparator cc;
std::sort(_unprocessed.begin(), _unprocessed.end(), cc);
RadixSort<TDigestRadixSortTraits>::executeLSD(_unprocessed.data(), _unprocessed.size());
auto count = _unprocessed.size();
_unprocessed.insert(_unprocessed.end(), _processed.cbegin(), _processed.cend());
std::inplace_merge(_unprocessed.begin(), _unprocessed.begin() + count, _unprocessed.end(), cc);

View File

@ -109,8 +109,7 @@ TEST_F(PercentileApproxTest, testNullVale) {
AggregateFunctions::percentile_approx_init(context, &stringVal2);
AggregateFunctions::percentile_approx_merge(context, serialized, &stringVal2);
DoubleVal v = AggregateFunctions::percentile_approx_finalize(context, stringVal2);
//ASSERT_EQ(v.val, 99900.5);
ASSERT_DOUBLE_EQ(v.val, 99900.665999999997);
ASSERT_FLOAT_EQ(v.val, 99900.665999999997);
}
}

View File

@ -50,3 +50,4 @@ ADD_BE_TEST(arrow/arrow_work_flow_test)
ADD_BE_TEST(counter_cond_variable_test)
ADD_BE_TEST(frame_of_reference_coding_test)
ADD_BE_TEST(bit_stream_utils_test)
ADD_BE_TEST(radix_sort_test)

View File

@ -0,0 +1,238 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <random>
#include <gtest/gtest.h>
#include <algorithm>
#include <iterator>
#include <iostream>
#include <cstdlib>
#include "util/tdigest.h"
#include "util/radix_sort.h"
namespace doris {
class RadixSortTest : public ::testing::Test {
protected:
// You can remove any or all of the following functions if its body
// is empty.
RadixSortTest() {
// You can do set-up work for each test here.
}
virtual ~RadixSortTest() {
// You can do clean-up work that doesn't throw exceptions here.
}
// If the constructor and destructor are not enough for setting up
// and cleaning up each test, you can define the following methods:
virtual void SetUp() {
// Code here will be called immediately after the constructor (right
// before each test).
}
virtual void TearDown() {
// Code here will be called immediately after each test (right
// before the destructor).
}
static void SetUpTestCase() {
static bool initialized = false;
if (!initialized) {
FLAGS_logtostderr = true;
google::InstallFailureSignalHandler();
google::InitGoogleLogging("testing::RadixSortTest");
initialized = true;
}
}
// Objects declared here can be used by all tests in the test case for Foo.
};
TEST_F(RadixSortTest, TestUint32Sort) {
constexpr size_t num_values = 10000;
std::vector<uint32_t> data;
// generating random data
for (size_t i = 0; i < num_values; ++i) {
data.push_back(num_values - i);
}
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(data.begin(), data.end(), g);
radixSortLSD(data.data(), data.size());
for (size_t i = 0; i < num_values; ++i) {
data[i] = i + 1;
}
}
TEST_F(RadixSortTest, TestInt32Sort) {
constexpr size_t num_values = 10000;
std::vector<int32_t> data;
// generating random data
for (size_t i = 0; i < num_values; ++i) {
data.push_back(num_values - i - 5000);
}
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(data.begin(), data.end(), g);
radixSortLSD(data.data(), data.size());
for (size_t i = 0; i < num_values; ++i) {
data[i] = i + 1 - 5000;
}
}
bool compare_float_with_epsilon(float a, float b, float E) {
return std::abs(a - b) < E;
}
TEST_F(RadixSortTest, TestFloatSort) {
constexpr size_t num_values = 10000;
std::vector<float> data;
// generating random data
for (size_t i = 0; i < num_values; ++i) {
data.push_back(1.0 * num_values - i - 5000 + 0.1);
}
float nan = std::numeric_limits<float>::quiet_NaN();
float max = std::numeric_limits<float>::max();
float min = std::numeric_limits<float>::lowest();
float infinity = std::numeric_limits<float>::infinity();
data.push_back(nan);
data.push_back(max);
data.push_back(min);
data.push_back(infinity);
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(data.begin(), data.end(), g);
radixSortLSD(data.data(), data.size());
for (size_t i = 0; i < num_values + 4; ++i) {
if (i == 0) {
ASSERT_TRUE(compare_float_with_epsilon(data[i], min, 0.0000001));
} else if (i == num_values + 1) {
ASSERT_TRUE(compare_float_with_epsilon(data[i], max, 0.0000001));
} else if (i == num_values + 2) {
ASSERT_TRUE(std::isinf(data[i]));
} else if (i == num_values + 3) {
ASSERT_TRUE(std::isnan(data[i]));
} else {
ASSERT_TRUE(compare_float_with_epsilon(data[i], 1.0 * i - 5000 + 0.1, 0.0000001));
}
}
}
bool compare_double_with_epsilon(double a, double b, double E) {
return std::abs(a - b) < E;
}
TEST_F(RadixSortTest, TestDoubleSort) {
constexpr size_t num_values = 10000;
std::vector<double> data;
// generating random data
for (size_t i = 0; i < num_values; ++i) {
data.push_back(num_values * 1.0 - i - 5000 + 0.1);
}
double nan = std::numeric_limits<double>::quiet_NaN();
double max = std::numeric_limits<double>::max();
double min = std::numeric_limits<double>::lowest();
double infinity = std::numeric_limits<double>::infinity();
data.push_back(nan);
data.push_back(max);
data.push_back(min);
data.push_back(infinity);
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(data.begin(), data.end(), g);
radixSortLSD(data.data(), data.size());
for (size_t i = 0; i < num_values + 4; ++i) {
if (i == 0) {
ASSERT_TRUE(compare_double_with_epsilon(data[i], min, 0.0000001));
} else if (i == num_values + 1) {
ASSERT_TRUE(compare_double_with_epsilon(data[i], max, 0.0000001));
} else if (i == num_values + 2) {
ASSERT_TRUE(std::isinf(data[i]));
} else if (i == num_values + 3) {
ASSERT_TRUE(std::isnan(data[i]));
} else {
double tmp = 1.0 * i - 5000 + 0.1;
ASSERT_TRUE(compare_double_with_epsilon(data[i], tmp, 0.0000001));
}
}
}
struct TestObject {
float d1;
float d2;
};
struct RadixSortTestTraits {
using Element = TestObject;
using Key = float;
using CountType = uint32_t;
using KeyBits = uint32_t;
static constexpr size_t PART_SIZE_BITS = 8;
using Transform = RadixSortFloatTransform<KeyBits>;
using Allocator = RadixSortMallocAllocator;
static Key & extractKey(Element& elem) { return elem.d1; }
};
TEST_F(RadixSortTest, TestObjectSort) {
constexpr size_t num_values = 10000;
std::vector<TestObject> data;
data.resize(10004);
// generating random data
for (size_t i = 0; i < num_values; ++i) {
data[i].d1 = 1.0 * num_values - i - 5000 + 0.1;
}
float nan = std::numeric_limits<float>::quiet_NaN();
float max = std::numeric_limits<float>::max();
float min = std::numeric_limits<float>::lowest();
float infinity = std::numeric_limits<float>::infinity();
data[num_values].d1 = nan;
data[num_values + 1].d1 = max;
data[num_values + 2].d1 = min;
data[num_values + 3].d1 = infinity;
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(data.begin(), data.end(), g);
RadixSort<RadixSortTestTraits>::executeLSD(data.data(), data.size());
for (size_t i = 0; i < num_values + 4; ++i) {
if (i == 0) {
ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, min, 0.0000001));
} else if (i == num_values + 1) {
ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, max, 0.0000001));
} else if (i == num_values + 2) {
ASSERT_TRUE(std::isinf(data[i].d1));
} else if (i == num_values + 3) {
ASSERT_TRUE(std::isnan(data[i].d1));
} else {
float tmp = 1.0 * i - 5000 + 0.1;
ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, tmp, 0.0000001));
}
}
}
} // namespace stesting
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

View File

@ -162,7 +162,7 @@ TEST_F(TDigestTest, MoreThan2BValues) {
const auto count = 1L << 28;
digest.add(next, count);
}
EXPECT_EQ(1000 + 10L * (1 << 28), digest.totalWeight());
EXPECT_EQ(static_cast<long>(1000 + float(10L * (1 << 28))), digest.totalWeight());
EXPECT_GT(digest.totalWeight(), std::numeric_limits<int32_t>::max());
std::vector<double> quantiles{0, 0.1, 0.5, 0.9, 1, reals(gen)};
std::sort(quantiles.begin(), quantiles.end());

View File

@ -154,6 +154,7 @@ ${DORIS_TEST_BINARY_DIR}/util/string_util_test
${DORIS_TEST_BINARY_DIR}/util/coding_test
${DORIS_TEST_BINARY_DIR}/util/faststring_test
${DORIS_TEST_BINARY_DIR}/util/tdigest_test
${DORIS_TEST_BINARY_DIR}/util/radix_sort_test
${DORIS_TEST_BINARY_DIR}/util/block_compression_test
${DORIS_TEST_BINARY_DIR}/util/arrow/arrow_row_block_test
${DORIS_TEST_BINARY_DIR}/util/arrow/arrow_row_batch_test