From ccc1b9d98c70a529f87a5fa09a2df44192dd0a4d Mon Sep 17 00:00:00 2001
From: shengyunyao <shunyuyao@126.com>
Date: Tue, 5 Nov 2019 09:25:47 +0800
Subject: [PATCH] Optimize percentile_approx through radix sort (#2102) (#2107)

---
 be/src/util/radix_sort.h                 | 320 +++++++++++++++++++++++
 be/src/util/tdigest.h                    |  35 ++-
 be/test/exprs/percentile_approx_test.cpp |   3 +-
 be/test/util/CMakeLists.txt              |   1 +
 be/test/util/radix_sort_test.cpp         | 238 +++++++++++++++++
 be/test/util/tdigest_test.cpp            |   2 +-
 run-ut.sh                                |   1 +
 7 files changed, 590 insertions(+), 10 deletions(-)
 create mode 100755 be/src/util/radix_sort.h
 create mode 100644 be/test/util/radix_sort_test.cpp
diff --git a/be/src/util/radix_sort.h b/be/src/util/radix_sort.h
new file mode 100755
index 0000000000..18d3679686
--- /dev/null
+++ b/be/src/util/radix_sort.h
@@ -0,0 +1,320 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*
+ * This implementation of RadixSort is copied from ClickHouse.
+ * We keep only the functions that are useful to us and fix some C++11 incompatibility problems.
+ * This implementation can be used to sort float, double, int, uint and other, more complex objects.
+ * See original code: https://github.com/ClickHouse/ClickHouse/blob/master/dbms/src/Common/RadixSort.h
+ *
+ */
+
+#ifndef RADIXSORT_H_
+#define RADIXSORT_H_
+
+#include <string.h>
+#include <malloc.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <cstdint>
+#include <type_traits>
+#include "common/compiler_util.h"
+
+namespace doris {
+
+template<typename T >
+using decay_t = typename std::decay<T>::type;
+
+template<bool cond, typename T, typename F>
+using conditional_t = typename std::conditional<cond, T, F>::type;
+
+template<typename T>
+using make_unsigned_t = typename std::make_unsigned<T>::type;
+
+template<typename T>
+using is_integral_v = std::is_integral<T>;    // C++11 has no variable templates: query as is_integral_v<T>::value
+
+template<typename T>
+using is_unsigned_v = std::is_unsigned<T>;    // C++11 has no variable templates: query as is_unsigned_v<T>::value
+
+template <typename To, typename From>
+decay_t<To> bit_cast(const From& from) {
+    To res {};
+    memcpy(static_cast<void*>(&res), &from, std::min(sizeof(res), sizeof(from)));
+    return res;
+}
+
+/** Radix sort, has the following functionality:
+  * Can sort unsigned, signed numbers, and floats.
+  * Can sort an array of fixed length elements that contain something else besides the key.
+  * Customizable radix size.
+  *
+  * LSB, stable.
+  * NOTE For some applications it makes sense to add MSB-radix-sort,
+  *  as well as radix-select, radix-partial-sort, radix-get-permutation algorithms based on it.
+  */
+
+
+/** Used as a template parameter. See below.
+  */
+struct RadixSortMallocAllocator {
+    void * allocate(size_t size) {
+        return malloc(size);
+    }
+
+    void deallocate(void * ptr, size_t /*size*/) {
+        return free(ptr);
+    }
+};
+
+
+/** A transformation that transforms the bit representation of a key into an unsigned integer number,
+  *  that the order relation over the keys will match the order relation over the obtained unsigned numbers.
+  * For floats this conversion does the following:
+  *  if the sign bit is set, it flips all bits; otherwise it flips only the sign bit.
+  * In this case, NaN-s are bigger than all normal numbers.
+  */
+template <typename KeyBits>
+struct RadixSortFloatTransform {
+    /// Is it worth writing the result in memory, or is it better to do calculation every time again?
+    static constexpr bool transform_is_simple = false;
+    /// forward: the XOR mask is all ones when the top bit of x is set, otherwise only the top bit.
+    static KeyBits forward(KeyBits x) {
+        return x ^ ((-(x >> (sizeof(KeyBits) * 8 - 1))) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)));
+    }
+    /// backward: inverse of forward; the mask is only the top bit when the top bit of x is set, otherwise all ones.
+    static KeyBits backward(KeyBits x) {
+        return x ^ (((x >> (sizeof(KeyBits) * 8 - 1)) - 1) | (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)));
+    }
+};
+
+
+template <typename TElement>
+struct RadixSortFloatTraits {
+    using Element = TElement;     /// The type of the element. It can be a structure with a key and some other payload. Or just a key.
+    using Key = Element;          /// The key to sort by.
+
+    /// Type for calculating histograms. In the case of a known small number of elements, it can be less than size_t.
+    using CountType = uint32_t;
+
+    /// The type to which the key is transformed to do bit operations. This UInt is the same size as the key.
+    using KeyBits = conditional_t<sizeof(Key) == 8, uint64_t, uint32_t>;
+
+    static constexpr size_t PART_SIZE_BITS = 8;    /// With what pieces of the key, in bits, to do one pass - reshuffle of the array.
+
+    /// Converting a key into KeyBits is such that the order relation over the key corresponds to the order relation over KeyBits.
+    using Transform = RadixSortFloatTransform<KeyBits>;
+
+    /// An object with the functions allocate and deallocate.
+    /// Can be used, for example, to allocate memory for a temporary array on the stack.
+    /// To do this, the allocator itself is created on the stack.
+    using Allocator = RadixSortMallocAllocator;
+
+    /// The function to get the key from an array element.
+    static Key & extractKey(Element & elem) { return elem; }
+
+    /// Used when fallback to comparison based sorting is needed.
+    /// TODO: Correct handling of NaNs, NULLs, etc
+    static bool less(Key x, Key y) {
+        return x < y;
+    }
+};
+
+
+template <typename KeyBits>
+struct RadixSortIdentityTransform {
+    static constexpr bool transform_is_simple = true;
+
+    static KeyBits forward(KeyBits x)     { return x; }
+    static KeyBits backward(KeyBits x)    { return x; }
+};
+
+
+template <typename TElement>
+struct RadixSortUIntTraits {
+    using Element = TElement;
+    using Key = Element;
+    using CountType = uint32_t;
+    using KeyBits = Key;
+
+    static constexpr size_t PART_SIZE_BITS = 8;
+
+    using Transform = RadixSortIdentityTransform<KeyBits>;
+    using Allocator = RadixSortMallocAllocator;
+
+    static Key & extractKey(Element & elem) { return elem; }
+
+    static bool less(Key x, Key y) {
+        return x < y;
+    }
+};
+
+
+template <typename KeyBits>
+struct RadixSortSignedTransform
+{
+    static constexpr bool transform_is_simple = true;
+
+    static KeyBits forward(KeyBits x)     { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
+    static KeyBits backward(KeyBits x)    { return x ^ (KeyBits(1) << (sizeof(KeyBits) * 8 - 1)); }
+};
+
+
+template <typename TElement>
+struct RadixSortIntTraits {
+    using Element = TElement;
+    using Key = Element;
+    using CountType = uint32_t;
+    using KeyBits = make_unsigned_t<Key>;
+
+    static constexpr size_t PART_SIZE_BITS = 8;
+
+    using Transform = RadixSortSignedTransform<KeyBits>;
+    using Allocator = RadixSortMallocAllocator;
+
+    static Key & extractKey(Element & elem) { return elem; }
+
+    static bool less(Key x, Key y) {
+        return x < y;
+    }
+};
+
+
+template <typename T>
+using RadixSortNumTraits =
+    conditional_t<std::is_integral<T>::value,
+        conditional_t<std::is_unsigned<T>::value,
+            RadixSortUIntTraits<T>,
+            RadixSortIntTraits<T>>,
+        RadixSortFloatTraits<T>>;
+
+/*
+ * To use RadixSort, you should define `Traits` to give out the information for sorting.
+ * `RadixSortFloatTraits` is a good example to refer to.
+ * Then you can run it as follows:
+ *           RadixSort<YourTraits>::executeLSD(arr, size);
+ *
+ * In particular, to sort an array of a numeric type, you can simply call:
+ *           radixSortLSD(array_of_numeric, array_size);
+ *
+ * See more use cases: be/test/util/radix_sort_test.cpp
+ *
+ */
+template <typename Traits>
+struct RadixSort {
+private:
+    using Element     = typename Traits::Element;
+    using Key         = typename Traits::Key;
+    using CountType   = typename Traits::CountType;
+    using KeyBits     = typename Traits::KeyBits;
+
+    // Intended insertion-sort cutoff (array size <= threshold); currently unused, executeLSD always radix-sorts.
+    static constexpr size_t INSERTION_SORT_THRESHOLD = 64;
+
+    static constexpr size_t HISTOGRAM_SIZE = 1 << Traits::PART_SIZE_BITS;
+    static constexpr size_t PART_BITMASK = HISTOGRAM_SIZE - 1;
+    static constexpr size_t KEY_BITS = sizeof(Key) * 8;
+    static constexpr size_t NUM_PASSES = (KEY_BITS + (Traits::PART_SIZE_BITS - 1)) / Traits::PART_SIZE_BITS;
+
+    static ALWAYS_INLINE KeyBits getPart(size_t N, KeyBits x) {
+        if (Traits::Transform::transform_is_simple)
+            x = Traits::Transform::forward(x);
+
+        return (x >> (N * Traits::PART_SIZE_BITS)) & PART_BITMASK;
+    }
+
+    static KeyBits keyToBits(Key x) { return bit_cast<KeyBits>(x); }
+    static Key bitsToKey(KeyBits x) { return bit_cast<Key>(x); }
+
+public:
+    /// Least significant digit radix sort (stable): sorts arr[0, size) in place by the key returned from Traits::extractKey.
+    static void executeLSD(Element * arr, size_t size) {
+        /// NOTE If the array is smaller than 256, then it is better to use another algorithm (no fallback is done here).
+        /// bit_cast copies only min(sizeof) bytes, so KeyBits and Key must have the same size.
+        static_assert(sizeof(Key) == sizeof(KeyBits), "Traits::KeyBits must be the same size as Traits::Key");
+        if (size == 0) return;  /// Nothing to sort; also avoids malloc(0) and memcpy with a possibly null buffer.
+        /// For each of the NUM_PASSES bit ranges of the key, count how often each value occurs. The NUM_PASSES loops are meant to be unrolled at compile-time.
+        CountType histograms[HISTOGRAM_SIZE * NUM_PASSES] = {0};
+
+        typename Traits::Allocator allocator;
+
+        /// We will do several passes through the array. On each pass, the data is transferred to another array. Let's allocate this temporary array.
+        Element * swap_buffer = reinterpret_cast<Element *>(allocator.allocate(size * sizeof(Element)));
+
+        /// Transform the array and calculate the histogram.
+        /// NOTE This is slightly suboptimal. Look at https://github.com/powturbo/TurboHist
+        for (size_t i = 0; i < size; ++i) {
+            if (!Traits::Transform::transform_is_simple)
+                Traits::extractKey(arr[i]) = bitsToKey(Traits::Transform::forward(keyToBits(Traits::extractKey(arr[i]))));
+
+            for (size_t pass = 0; pass < NUM_PASSES; ++pass)
+                ++histograms[pass * HISTOGRAM_SIZE + getPart(pass, keyToBits(Traits::extractKey(arr[i])))];
+        }
+
+        {
+            /// Replace the histograms with the accumulated sums: the value in position i is the sum of the previous positions minus one.
+            size_t sums[NUM_PASSES] = {0};
+
+            for (size_t i = 0; i < HISTOGRAM_SIZE; ++i) {
+                for (size_t pass = 0; pass < NUM_PASSES; ++pass) {
+                    size_t tmp = histograms[pass * HISTOGRAM_SIZE + i] + sums[pass];
+                    histograms[pass * HISTOGRAM_SIZE + i] = sums[pass] - 1;
+                    sums[pass] = tmp;
+                }
+            }
+        }
+
+        /// Move the elements in the order starting from the least bit piece, and then do a few passes on the number of pieces.
+        for (size_t pass = 0; pass < NUM_PASSES; ++pass) {
+            Element * writer = pass % 2 ? arr : swap_buffer;
+            Element * reader = pass % 2 ? swap_buffer : arr;
+
+            for (size_t i = 0; i < size; ++i) {
+                size_t pos = getPart(pass, keyToBits(Traits::extractKey(reader[i])));
+
+                /// Place the element on the next free position.
+                auto & dest = writer[++histograms[pass * HISTOGRAM_SIZE + pos]];
+                dest = reader[i];
+
+                /// On the last pass, we do the reverse transformation.
+                if (!Traits::Transform::transform_is_simple && pass == NUM_PASSES - 1)
+                    Traits::extractKey(dest) = bitsToKey(Traits::Transform::backward(keyToBits(Traits::extractKey(reader[i]))));
+            }
+        }
+
+        /// If the number of passes is odd, the result array is in a temporary buffer. Copy it to the place of the original array.
+        /// NOTE Sometimes it will be more optimal to provide non-destructive interface, that will not modify original array.
+        if (NUM_PASSES % 2)
+            memcpy(arr, swap_buffer, size * sizeof(Element));
+
+        allocator.deallocate(swap_buffer, size * sizeof(Element));
+    }
+};
+
+
+/// Helper functions for numeric types.
+/// Use RadixSort with custom traits for complex types instead.
+
+template <typename T>
+void radixSortLSD(T *arr, size_t size) {
+    RadixSort<RadixSortNumTraits<T>>::executeLSD(arr, size);
+}
+
+} // namespace doris
+
+#endif // RADIXSORT_H_
diff --git a/be/src/util/tdigest.h b/be/src/util/tdigest.h
index d14fa221a5..57cb4159bb 100644
--- a/be/src/util/tdigest.h
+++ b/be/src/util/tdigest.h
@@ -33,10 +33,10 @@
  */
 
 // T-Digest :  Percentile and Quantile Estimation of Big Data
-// A new data structure for accurate on-line accumulation of rank-based statistics 
+// A new data structure for accurate on-line accumulation of rank-based statistics
 // such as quantiles and trimmed means.
-// See original paper: "Computing extremely accurate quantiles using t-digest" 
-// by Ted Dunning and Otmar Ertl for more details 
+// See original paper: "Computing extremely accurate quantiles using t-digest"
+// by Ted Dunning and Otmar Ertl for more details
 // https://github.com/tdunning/t-digest/blob/07b8f2ca2be8d0a9f04df2feadad5ddc1bb73c88/docs/t-digest-paper/histo.pdf.
 // https://github.com/derrickburns/tdigest
 
@@ -54,12 +54,13 @@
 
 #include "common/logging.h"
 #include "util/debug_util.h"
+#include "util/radix_sort.h"
 #include "udf/udf.h"
 
 namespace doris {
 
-using Value = double;
-using Weight = double;
+using Value = float;
+using Weight = float;
 using Index = size_t;
 
 const size_t kHighWater = 40000;
@@ -74,6 +75,10 @@ public:
 
     inline Weight weight() const noexcept { return _weight; }
 
+    inline Value& mean() noexcept { return _mean; }
+
+    inline Weight& weight() noexcept { return _weight; }
+
     inline void add(const Centroid &c) {
         DCHECK_GT(c._weight, 0);
         if (_weight != 0.0) {
@@ -115,6 +120,22 @@ struct CentroidComparator {
 
 class TDigest {
 
+    struct TDigestRadixSortTraits
+    {
+        using Element = Centroid;
+        using Key = Value;
+        using CountType = uint32_t;
+        using KeyBits = uint32_t;
+
+        static constexpr size_t PART_SIZE_BITS = 8;
+
+        using Transform = RadixSortFloatTransform<KeyBits>;
+        using Allocator = RadixSortMallocAllocator;
+
+        static Key & extractKey(Element& elem) { return elem.mean();   }
+
+    };
+
     class TDigestComparator {
     public:
         TDigestComparator() {}
@@ -415,7 +436,7 @@ public:
             }
         }
     }
-        
+
     uint32_t serialized_size() {
         return sizeof(Value) * 5 + sizeof(Index) * 2 + sizeof(size_t) * 3
                + _processed.size() * sizeof(Centroid)
@@ -612,7 +633,7 @@ private:
     // when complete, _unprocessed will be empty and _processed will have at most _max_processed centroids
     inline void process() {
         CentroidComparator cc;
-        std::sort(_unprocessed.begin(), _unprocessed.end(), cc);
+        RadixSort<TDigestRadixSortTraits>::executeLSD(_unprocessed.data(), _unprocessed.size());
         auto count = _unprocessed.size();
         _unprocessed.insert(_unprocessed.end(), _processed.cbegin(), _processed.cend());
         std::inplace_merge(_unprocessed.begin(), _unprocessed.begin() + count, _unprocessed.end(), cc);
diff --git a/be/test/exprs/percentile_approx_test.cpp b/be/test/exprs/percentile_approx_test.cpp
index d52298ad54..7b9b6422c9 100644
--- a/be/test/exprs/percentile_approx_test.cpp
+++ b/be/test/exprs/percentile_approx_test.cpp
@@ -109,8 +109,7 @@ TEST_F(PercentileApproxTest, testNullVale) {
     AggregateFunctions::percentile_approx_init(context, &stringVal2);
     AggregateFunctions::percentile_approx_merge(context, serialized, &stringVal2);
     DoubleVal v = AggregateFunctions::percentile_approx_finalize(context, stringVal2);
-    //ASSERT_EQ(v.val, 99900.5);
-    ASSERT_DOUBLE_EQ(v.val, 99900.665999999997);
+    ASSERT_FLOAT_EQ(v.val, 99900.665999999997);
 }
 
 }
diff --git a/be/test/util/CMakeLists.txt b/be/test/util/CMakeLists.txt
index fee8a70cd0..bc78c18256 100644
--- a/be/test/util/CMakeLists.txt
+++ b/be/test/util/CMakeLists.txt
@@ -50,3 +50,4 @@ ADD_BE_TEST(arrow/arrow_work_flow_test)
 ADD_BE_TEST(counter_cond_variable_test)
 ADD_BE_TEST(frame_of_reference_coding_test)
 ADD_BE_TEST(bit_stream_utils_test)
+ADD_BE_TEST(radix_sort_test)
diff --git a/be/test/util/radix_sort_test.cpp b/be/test/util/radix_sort_test.cpp
new file mode 100644
index 0000000000..bb46ff9f76
--- /dev/null
+++ b/be/test/util/radix_sort_test.cpp
@@ -0,0 +1,238 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <random>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <iterator>
+#include <iostream>
+#include <cstdlib>
+
+#include "util/tdigest.h"
+#include "util/radix_sort.h"
+
+namespace doris {
+
+class RadixSortTest : public ::testing::Test {
+protected:
+    // You can remove any or all of the following functions if its body
+    // is empty.
+    RadixSortTest() {
+        // You can do set-up work for each test here.
+    }
+
+    virtual ~RadixSortTest() {
+        // You can do clean-up work that doesn't throw exceptions here.
+    }
+
+    // If the constructor and destructor are not enough for setting up
+    // and cleaning up each test, you can define the following methods:
+
+    virtual void SetUp() {
+        // Code here will be called immediately after the constructor (right
+        // before each test).
+    }
+
+    virtual void TearDown() {
+        // Code here will be called immediately after each test (right
+        // before the destructor).
+    }
+
+    static void SetUpTestCase() {
+        static bool initialized = false;
+        if (!initialized) {
+            FLAGS_logtostderr = true;
+            google::InstallFailureSignalHandler();
+            google::InitGoogleLogging("testing::RadixSortTest");
+            initialized = true;
+        }
+    }
+
+    // Objects declared here can be used by all tests in the RadixSortTest test case.
+};
+
+TEST_F(RadixSortTest, TestUint32Sort) {  // radixSortLSD must order a shuffled permutation of 1..num_values
+    constexpr size_t num_values = 10000;
+    std::vector<uint32_t> data;
+    // Fill with num_values, num_values - 1, ..., 1; the shuffle below randomizes the order.
+    for (size_t i = 0; i < num_values; ++i) {
+        data.push_back(num_values - i);
+    }
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(data.begin(), data.end(), g);
+    radixSortLSD(data.data(), data.size());
+    for (size_t i = 0; i < num_values; ++i) {
+        ASSERT_EQ(static_cast<uint32_t>(i + 1), data[i]);  // sorted output must be exactly 1, 2, ..., num_values
+    }
+}
+
+TEST_F(RadixSortTest, TestInt32Sort) {  // radixSortLSD must order shuffled signed values -4999..5000
+    constexpr size_t num_values = 10000;
+    std::vector<int32_t> data;
+    // Fill with 5000, 4999, ..., -4999; the shuffle below randomizes the order.
+    for (size_t i = 0; i < num_values; ++i) {
+        data.push_back(num_values - i - 5000);
+    }
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(data.begin(), data.end(), g);
+    radixSortLSD(data.data(), data.size());
+    for (size_t i = 0; i < num_values; ++i) {
+        ASSERT_EQ(static_cast<int32_t>(i) + 1 - 5000, data[i]);  // sorted output must be -4999, -4998, ..., 5000
+    }
+}
+
+bool compare_float_with_epsilon(float a, float b, float E) {
+    return std::abs(a - b) < E;
+}
+
+TEST_F(RadixSortTest, TestFloatSort) {
+    constexpr size_t num_values = 10000;
+    std::vector<float> data;
+    // generating random data
+    for (size_t i = 0; i < num_values; ++i) {
+        data.push_back(1.0 * num_values - i - 5000 + 0.1);
+    }
+    float nan = std::numeric_limits<float>::quiet_NaN();
+    float max = std::numeric_limits<float>::max();
+    float min = std::numeric_limits<float>::lowest();
+    float infinity = std::numeric_limits<float>::infinity();
+    data.push_back(nan);
+    data.push_back(max);
+    data.push_back(min);
+    data.push_back(infinity);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(data.begin(), data.end(), g);
+    radixSortLSD(data.data(), data.size());
+    for (size_t i = 0; i < num_values + 4; ++i) {
+        if (i == 0) {
+            ASSERT_TRUE(compare_float_with_epsilon(data[i], min, 0.0000001));
+        } else if (i == num_values + 1) {
+            ASSERT_TRUE(compare_float_with_epsilon(data[i], max, 0.0000001));
+        } else if (i == num_values + 2) {
+            ASSERT_TRUE(std::isinf(data[i]));
+        } else if (i == num_values + 3) {
+            ASSERT_TRUE(std::isnan(data[i]));
+        } else {
+            ASSERT_TRUE(compare_float_with_epsilon(data[i], 1.0 * i - 5000 + 0.1, 0.0000001));
+        }
+    }
+}
+
+bool compare_double_with_epsilon(double a, double b, double E) {
+    return std::abs(a - b) < E;
+}
+
+TEST_F(RadixSortTest, TestDoubleSort) {
+    constexpr size_t num_values = 10000;
+    std::vector<double> data;
+    // generating random data
+    for (size_t i = 0; i < num_values; ++i) {
+        data.push_back(num_values * 1.0 - i - 5000 + 0.1);
+    }
+    double nan = std::numeric_limits<double>::quiet_NaN();
+    double max = std::numeric_limits<double>::max();
+    double min = std::numeric_limits<double>::lowest();
+    double infinity = std::numeric_limits<double>::infinity();
+    data.push_back(nan);
+    data.push_back(max);
+    data.push_back(min);
+    data.push_back(infinity);
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(data.begin(), data.end(), g);
+    radixSortLSD(data.data(), data.size());
+    for (size_t i = 0; i < num_values + 4; ++i) {
+        if (i == 0) {
+            ASSERT_TRUE(compare_double_with_epsilon(data[i], min, 0.0000001));
+        } else if (i == num_values + 1) {
+            ASSERT_TRUE(compare_double_with_epsilon(data[i], max, 0.0000001));
+        } else if (i == num_values + 2) {
+            ASSERT_TRUE(std::isinf(data[i]));
+        } else if (i == num_values + 3) {
+            ASSERT_TRUE(std::isnan(data[i]));
+        } else {
+            double tmp = 1.0 * i - 5000 + 0.1;
+            ASSERT_TRUE(compare_double_with_epsilon(data[i], tmp, 0.0000001));
+        }
+    }
+}
+
+struct TestObject {
+    float d1;
+    float d2;
+};
+
+struct RadixSortTestTraits {
+    using Element = TestObject;
+    using Key = float;
+    using CountType = uint32_t;
+    using KeyBits = uint32_t;
+
+    static constexpr size_t PART_SIZE_BITS = 8;
+
+    using Transform = RadixSortFloatTransform<KeyBits>;
+    using Allocator = RadixSortMallocAllocator;
+
+    static Key & extractKey(Element& elem) { return elem.d1;    }
+};
+
+TEST_F(RadixSortTest, TestObjectSort) {
+    constexpr size_t num_values = 10000;
+    std::vector<TestObject> data;
+    data.resize(10004);
+    // generating random data
+    for (size_t i = 0; i < num_values; ++i) {
+        data[i].d1 = 1.0 * num_values - i - 5000 + 0.1;
+    }
+    float nan = std::numeric_limits<float>::quiet_NaN();
+    float max = std::numeric_limits<float>::max();
+    float min = std::numeric_limits<float>::lowest();
+    float infinity = std::numeric_limits<float>::infinity();
+    data[num_values].d1 = nan;
+    data[num_values + 1].d1 = max;
+    data[num_values + 2].d1 = min;
+    data[num_values + 3].d1 = infinity;
+    std::random_device rd;
+    std::mt19937 g(rd());
+    std::shuffle(data.begin(), data.end(), g);
+    RadixSort<RadixSortTestTraits>::executeLSD(data.data(), data.size());
+    for (size_t i = 0; i < num_values + 4; ++i) {
+        if (i == 0) {
+            ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, min, 0.0000001));
+        } else if (i == num_values + 1) {
+            ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, max, 0.0000001));
+        } else if (i == num_values + 2) {
+            ASSERT_TRUE(std::isinf(data[i].d1));
+        } else if (i == num_values + 3) {
+            ASSERT_TRUE(std::isnan(data[i].d1));
+        } else {
+            float tmp = 1.0 * i - 5000 + 0.1;
+            ASSERT_TRUE(compare_float_with_epsilon(data[i].d1, tmp, 0.0000001));
+        }
+    }
+}
+
+
+}  // namespace doris
+
+int main(int argc, char** argv) {
+    ::testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+}
diff --git a/be/test/util/tdigest_test.cpp b/be/test/util/tdigest_test.cpp
index eeba09563e..4535a84180 100644
--- a/be/test/util/tdigest_test.cpp
+++ b/be/test/util/tdigest_test.cpp
@@ -162,7 +162,7 @@ TEST_F(TDigestTest, MoreThan2BValues) {
         const auto count = 1L << 28;
         digest.add(next, count);
     }
-    EXPECT_EQ(1000 + 10L * (1 << 28), digest.totalWeight());
+    EXPECT_EQ(static_cast<long>(1000 + float(10L * (1 << 28))), digest.totalWeight());
     EXPECT_GT(digest.totalWeight(), std::numeric_limits<int32_t>::max());
     std::vector<double> quantiles{0, 0.1, 0.5, 0.9, 1, reals(gen)};
     std::sort(quantiles.begin(), quantiles.end());
diff --git a/run-ut.sh b/run-ut.sh
index d4f446234d..c5b498c8d1 100755
--- a/run-ut.sh
+++ b/run-ut.sh
@@ -154,6 +154,7 @@ ${DORIS_TEST_BINARY_DIR}/util/string_util_test
 ${DORIS_TEST_BINARY_DIR}/util/coding_test
 ${DORIS_TEST_BINARY_DIR}/util/faststring_test
 ${DORIS_TEST_BINARY_DIR}/util/tdigest_test
+${DORIS_TEST_BINARY_DIR}/util/radix_sort_test
 ${DORIS_TEST_BINARY_DIR}/util/block_compression_test
 ${DORIS_TEST_BINARY_DIR}/util/arrow/arrow_row_block_test
 ${DORIS_TEST_BINARY_DIR}/util/arrow/arrow_row_batch_test