doris/be/src/vec/functions/round.h

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionRound.h
// and modified by Doris

#pragma once

#ifdef __SSE4_1__
#include <smmintrin.h>
#else
#include <fenv.h>
#endif

#include "vec/columns/column.h"
#include "vec/columns/column_decimal.h"
#include "vec/data_types/data_type_decimal.h"
#include "vec/data_types/data_type_number.h"

namespace doris::vectorized {

/// How the requested number of digits N is interpreted by the rounding routines.
enum class ScaleMode {
    /// Round to a number with N decimal places after the decimal point.
    Positive,
    /// Round to an integer with N zero characters.
    Negative,
    /// Round to an integer.
    Zero,
};

/// Direction of rounding.
/// With SSE4.1 the enumerators are built from the _MM_FROUND_* constants and are passed
/// (cast to int) to the SSE rounding intrinsics; without SSE4.1 plain numbers are used.
enum class RoundingMode {
#ifdef __SSE4_1__
    Round = _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC,
    Floor = _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC,
    Ceil = _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC,
    Trunc = _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC,
#else
    /// The values correspond to the SSE4.1 ones above, just in case.
    Round = 8,
    Floor = 9,
    Ceil = 10,
    Trunc = 11,
#endif
};

/// How a value exactly halfway between two candidates is resolved.
enum class TieBreakingMode {
    /// Use banker's rounding for floating point numbers, round up otherwise.
    Auto,
    /// Use banker's rounding.
    Bankers,
};

/** Rounding of one integer value to a multiple of `scale`.
  *
  * T                 - integer type of the value.
  * rounding_mode     - direction of rounding.
  * scale_mode        - only ScaleMode::Negative changes the value; for Zero and Positive
  *                     the value is returned unchanged.
  * tie_breaking_mode - how exact halves are resolved when rounding_mode is Round.
  */
template <typename T, RoundingMode rounding_mode, ScaleMode scale_mode,
          TieBreakingMode tie_breaking_mode>
struct IntegerRoundingComputation {
    /// One value is processed per call of the pointer-based compute().
    static const size_t data_count = 1;

    /// The scale needs no preprocessing for integers.
    static size_t prepare(size_t scale) { return scale; }

    /// Rounds x to a multiple of scale according to rounding_mode.
    /// Integer overflow is Ok.
    static ALWAYS_INLINE T compute_impl(T x, T scale) {
        if constexpr (rounding_mode == RoundingMode::Trunc) {
            // Integer division truncates toward zero.
            return x / scale * scale;
        } else if constexpr (rounding_mode == RoundingMode::Floor) {
            // Push negative values down first so that the truncating division
            // ends up on the multiple below x.
            if (x < 0) {
                x -= scale - 1;
            }
            return x / scale * scale;
        } else if constexpr (rounding_mode == RoundingMode::Ceil) {
            // Push non-negative values up first so that the truncating division
            // ends up on the multiple above x.
            if (x >= 0) {
                x += scale - 1;
            }
            return x / scale * scale;
        } else if constexpr (rounding_mode == RoundingMode::Round) {
            // Shift negative values by one full step to compensate for the
            // truncating division below.
            if (x < 0) {
                x -= scale;
            }
            if constexpr (tie_breaking_mode == TieBreakingMode::Auto) {
                x = (x + scale / 2) / scale * scale;
            } else if constexpr (tie_breaking_mode == TieBreakingMode::Bankers) {
                const auto biased = x + scale / 2;
                T quotient = biased / scale;
                if (quotient * scale == biased) {
                    // Exact half: pick the even multiple.
                    x = ((quotient + (x < 0)) & ~1) * scale;
                } else {
                    // Everything else is rounded as usual.
                    x = quotient * scale;
                }
            }
            return x;
        } else {
            __builtin_unreachable();
        }
    }

    /// Applies the rounding only for ScaleMode::Negative; otherwise x is returned as is.
    static ALWAYS_INLINE T compute(T x, T scale) {
        if constexpr (scale_mode == ScaleMode::Negative) {
            return compute_impl(x, scale);
        } else if constexpr (scale_mode == ScaleMode::Zero || scale_mode == ScaleMode::Positive) {
            return x;
        } else {
            __builtin_unreachable();
        }
    }

    /// Reads one value from `in`, rounds it and writes the result to `out`.
    /// For ScaleMode::Negative and types not wider than size_t, a scale larger than the
    /// maximum of T produces 0.
    static ALWAYS_INLINE void compute(const T* __restrict in, size_t scale, T* __restrict out) {
        if constexpr (sizeof(T) <= sizeof(scale) && scale_mode == ScaleMode::Negative) {
            const bool scale_exceeds_type = scale > size_t(std::numeric_limits<T>::max());
            if (scale_exceeds_type) {
                *out = 0;
                return;
            }
        }
        *out = compute(*in, scale);
    }
};

template <typename T, RoundingMode rounding_mode, TieBreakingMode tie_breaking_mode>
class DecimalRoundingImpl {
private:
    using NativeType = typename T::NativeType;
    using Op = IntegerRoundingComputation<NativeType, rounding_mode, ScaleMode::Negative,
                                          tie_breaking_mode>;
    using Container = typename ColumnDecimal<T>::Container;

public:
    static NO_INLINE void apply(const Container& in, UInt32 in_scale, Container& out,
                                Int16 scale_arg) {
        scale_arg = in_scale - scale_arg;
        if (scale_arg > 0) {
            size_t scale = int_exp10(scale_arg);

            const NativeType* __restrict p_in = reinterpret_cast<const NativeType*>(in.data());
            const NativeType* end_in = reinterpret_cast<const NativeType*>(in.data()) + in.size();
            NativeType* __restrict p_out = reinterpret_cast<NativeType*>(out.data());

            while (p_in < end_in) {
                Op::compute(p_in, scale, p_out);
                ++p_in;
                ++p_out;
            }
        } else {
            memcpy(out.data(), in.data(), in.size() * sizeof(T));
        }
    }
};

#ifdef __SSE4_1__

template <typename T>
class BaseFloatRoundingComputation;

/// SSE4.1 building blocks for rounding Float32 values, four at a time.
template <>
class BaseFloatRoundingComputation<Float32> {
public:
    using ScalarType = Float32;
    using VectorType = __m128;
    /// Number of scalars held by one VectorType.
    static const size_t data_count = 4;

    /// Unaligned load of data_count consecutive values.
    static VectorType load(const ScalarType* in) { return _mm_loadu_ps(in); }

    /// Replicates one scalar into every lane.
    static VectorType load1(const ScalarType in) { return _mm_set1_ps(in); }

    /// Unaligned store of data_count consecutive values.
    static void store(ScalarType* out, VectorType val) { _mm_storeu_ps(out, val); }

    static VectorType multiply(VectorType val, VectorType scale) { return _mm_mul_ps(val, scale); }

    static VectorType divide(VectorType val, VectorType scale) { return _mm_div_ps(val, scale); }

    /// Rounds every lane; the enumerator value is used as the rounding operand.
    template <RoundingMode mode>
    static VectorType apply(VectorType val) {
        return _mm_round_ps(val, static_cast<int>(mode));
    }

    /// Converts the integer scale to Float32 and replicates it into every lane.
    static VectorType prepare(size_t scale) { return load1(static_cast<ScalarType>(scale)); }
};

/// SSE4.1 building blocks for rounding Float64 values, two at a time.
template <>
class BaseFloatRoundingComputation<Float64> {
public:
    using ScalarType = Float64;
    using VectorType = __m128d;
    /// Number of scalars held by one VectorType.
    static const size_t data_count = 2;

    /// Unaligned load of data_count consecutive values.
    static VectorType load(const ScalarType* in) { return _mm_loadu_pd(in); }

    /// Replicates one scalar into every lane.
    static VectorType load1(const ScalarType in) { return _mm_set1_pd(in); }

    /// Unaligned store of data_count consecutive values.
    static void store(ScalarType* out, VectorType val) { _mm_storeu_pd(out, val); }

    static VectorType multiply(VectorType val, VectorType scale) { return _mm_mul_pd(val, scale); }

    static VectorType divide(VectorType val, VectorType scale) { return _mm_div_pd(val, scale); }

    /// Rounds every lane; the enumerator value is used as the rounding operand.
    template <RoundingMode mode>
    static VectorType apply(VectorType val) {
        return _mm_round_pd(val, static_cast<int>(mode));
    }

    /// Converts the integer scale to Float64 and replicates it into every lane.
    static VectorType prepare(size_t scale) { return load1(static_cast<ScalarType>(scale)); }
};

#else

/// Scalar implementation for targets without SSE4.1 (e.g. ARM). Not vectorized.

/// Rounds a single float in the direction given by `mode`.
/// RoundingMode::Round is implemented with nearbyintf, so its result depends on the
/// floating point rounding mode that is currently set.
inline float roundWithMode(float x, RoundingMode mode) {
    if (mode == RoundingMode::Round) {
        return nearbyintf(x);
    }
    if (mode == RoundingMode::Floor) {
        return floorf(x);
    }
    if (mode == RoundingMode::Ceil) {
        return ceilf(x);
    }
    if (mode == RoundingMode::Trunc) {
        return truncf(x);
    }

    // Every enumerator is handled above.
    __builtin_unreachable();
}

/// Rounds a single double in the direction given by `mode`.
/// RoundingMode::Round is implemented with nearbyint, so its result depends on the
/// floating point rounding mode that is currently set.
inline double roundWithMode(double x, RoundingMode mode) {
    if (mode == RoundingMode::Round) {
        return nearbyint(x);
    }
    if (mode == RoundingMode::Floor) {
        return floor(x);
    }
    if (mode == RoundingMode::Ceil) {
        return ceil(x);
    }
    if (mode == RoundingMode::Trunc) {
        return trunc(x);
    }

    // Every enumerator is handled above.
    __builtin_unreachable();
}

/// Scalar building blocks for rounding floating point values, one at a time.
/// Offers the same set of operations as the SSE4.1 specializations, with the "vector"
/// being a single value of type T.
template <typename T>
class BaseFloatRoundingComputation {
public:
    using ScalarType = T;
    using VectorType = T;
    /// One value is processed per step.
    static const size_t data_count = 1;

    static VectorType load(const ScalarType* in) { return in[0]; }

    static VectorType load1(const ScalarType in) { return in; }

    /// Writes the value and returns what has been stored.
    static VectorType store(ScalarType* out, ScalarType val) {
        *out = val;
        return *out;
    }

    static VectorType multiply(VectorType val, VectorType scale) { return val * scale; }

    static VectorType divide(VectorType val, VectorType scale) { return val / scale; }

    /// Rounds the value in the direction given by `mode`.
    template <RoundingMode mode>
    static VectorType apply(VectorType val) {
        return roundWithMode(val, mode);
    }

    /// Converts the integer scale to the floating point type.
    static VectorType prepare(size_t scale) { return load1(static_cast<ScalarType>(scale)); }
};

#endif

/** Low-level rounding of floating-point values.
  * One call handles Base::data_count consecutive values:
  *  - ScaleMode::Positive: multiply by the scale, round, divide by the scale;
  *  - ScaleMode::Negative: divide by the scale, round, multiply by the scale;
  *  - otherwise: round only.
  */
template <typename T, RoundingMode rounding_mode, ScaleMode scale_mode>
class FloatRoundingComputation : public BaseFloatRoundingComputation<T> {
    using Base = BaseFloatRoundingComputation<T>;

public:
    static inline void compute(const T* __restrict in, const typename Base::VectorType& scale,
                               T* __restrict out) {
        auto val = Base::load(in);

        if constexpr (scale_mode == ScaleMode::Positive) {
            val = Base::multiply(val, scale);
            val = Base::template apply<rounding_mode>(val);
            val = Base::divide(val, scale);
        } else if constexpr (scale_mode == ScaleMode::Negative) {
            val = Base::divide(val, scale);
            val = Base::template apply<rounding_mode>(val);
            val = Base::multiply(val, scale);
        } else {
            val = Base::template apply<rounding_mode>(val);
        }

        Base::store(out, val);
    }
};

/** High-level rounding of a whole floating-point column.
  * The data is processed in groups of Op::data_count values; an incomplete last group
  * goes through a zero-padded temporary buffer.
  */
template <typename T, RoundingMode rounding_mode, ScaleMode scale_mode>
struct FloatRoundingImpl {
private:
    static_assert(!IsDecimalNumber<T>);

    using Op = FloatRoundingComputation<T, rounding_mode, scale_mode>;
    using Data = std::array<T, Op::data_count>;
    using ColumnType = ColumnVector<T>;
    using Container = typename ColumnType::Container;

public:
    /// Rounds every value of `in` and writes the result to `out`.
    /// `out` must already have the same number of elements as `in`.
    static NO_INLINE void apply(const Container& in, size_t scale, Container& out) {
        const auto mm_scale = Op::prepare(scale);

        constexpr size_t group_size = std::tuple_size<Data>::value;
        const size_t total = in.size();
        const size_t grouped = total / group_size * group_size;

        const T* __restrict src = in.data();
        T* __restrict dst = out.data();

        // Complete groups are handled directly on the column data.
        for (size_t pos = 0; pos < grouped; pos += group_size) {
            Op::compute(src + pos, mm_scale, dst + pos);
        }

        const size_t tail = total - grouped;
        if (tail > 0) {
            // Copy the remainder into a zero-filled group, process the whole group and
            // copy back only the part that belongs to the column.
            Data padded_src {{}};
            Data padded_dst;

            const size_t tail_size_bytes = tail * sizeof(T);

            memcpy(&padded_src, src + grouped, tail_size_bytes);
            Op::compute(padded_src.data(), mm_scale, padded_dst.data());
            memcpy(dst + grouped, &padded_dst, tail_size_bytes);
        }
    }
};

/** Rounding of a whole integer column.
  * `scale` is the power of ten the values are rounded to a multiple of
  * (only ScaleMode::Negative changes the values, see IntegerRoundingComputation).
  */
template <typename T, RoundingMode rounding_mode, ScaleMode scale_mode,
          TieBreakingMode tie_breaking_mode>
struct IntegerRoundingImpl {
private:
    using Op = IntegerRoundingComputation<T, rounding_mode, scale_mode, tie_breaking_mode>;
    using Container = typename ColumnVector<T>::Container;

    /// Generic path for a scale that is not one of the constants specialised in apply().
    /// The scale is a runtime value here, so the division is not turned into a division
    /// by a constant. `scale` must not be zero.
    static NO_INLINE void applyRuntime(const Container& in, size_t scale, Container& out) {
        const T* end_in = in.data() + in.size();

        const T* __restrict p_in = in.data();
        T* __restrict p_out = out.data();

        while (p_in < end_in) {
            Op::compute(p_in, scale, p_out);
            ++p_in;
            ++p_out;
        }
    }

public:
    /// Rounds every value of `in` with a compile-time scale and writes the result to `out`.
    /// `out` must already have the same number of elements as `in`.
    template <size_t scale>
    static NO_INLINE void applyImpl(const Container& in, Container& out) {
        const T* end_in = in.data() + in.size();

        const T* __restrict p_in = in.data();
        T* __restrict p_out = out.data();

        while (p_in < end_in) {
            Op::compute(p_in, scale, p_out);
            ++p_in;
            ++p_out;
        }
    }

    /// Rounds every value of `in` and writes the result to `out`.
    /// `out` must already have the same number of elements as `in`.
    static NO_INLINE void apply(const Container& in, size_t scale, Container& out) {
        /// Manual function cloning for compiler to generate integer division by constant.
        switch (scale) {
        case 1ULL:
            return applyImpl<1ULL>(in, out);
        case 10ULL:
            return applyImpl<10ULL>(in, out);
        case 100ULL:
            return applyImpl<100ULL>(in, out);
        case 1000ULL:
            return applyImpl<1000ULL>(in, out);
        case 10000ULL:
            return applyImpl<10000ULL>(in, out);
        case 100000ULL:
            return applyImpl<100000ULL>(in, out);
        case 1000000ULL:
            return applyImpl<1000000ULL>(in, out);
        case 10000000ULL:
            return applyImpl<10000000ULL>(in, out);
        case 100000000ULL:
            return applyImpl<100000000ULL>(in, out);
        case 1000000000ULL:
            return applyImpl<1000000000ULL>(in, out);
        case 10000000000ULL:
            return applyImpl<10000000000ULL>(in, out);
        case 100000000000ULL:
            return applyImpl<100000000000ULL>(in, out);
        case 1000000000000ULL:
            return applyImpl<1000000000000ULL>(in, out);
        case 10000000000000ULL:
            return applyImpl<10000000000000ULL>(in, out);
        case 100000000000000ULL:
            return applyImpl<100000000000000ULL>(in, out);
        case 1000000000000000ULL:
            return applyImpl<1000000000000000ULL>(in, out);
        case 10000000000000000ULL:
            return applyImpl<10000000000000000ULL>(in, out);
        case 100000000000000000ULL:
            return applyImpl<100000000000000000ULL>(in, out);
        case 1000000000000000000ULL:
            return applyImpl<1000000000000000000ULL>(in, out);
        case 10000000000000000000ULL:
            return applyImpl<10000000000000000000ULL>(in, out);
        default:
            // The scale is derived from a user supplied argument, so a value outside of
            // the list above must not be treated as unreachable: that would be undefined
            // behaviour. Use the generic loop instead.
            return applyRuntime(in, scale, out);
        }
    }
};

/** Chooses the rounding implementation for the value type T (decimal, floating point or
  * integer) and, for number columns, the scale mode from the sign of the scale argument.
  */
template <typename T, RoundingMode rounding_mode, TieBreakingMode tie_breaking_mode>
struct Dispatcher {
    template <ScaleMode scale_mode>
    using FunctionRoundingImpl = std::conditional_t<
            IsDecimalNumber<T>, DecimalRoundingImpl<T, rounding_mode, tie_breaking_mode>,
            std::conditional_t<
                    std::is_floating_point_v<T>, FloatRoundingImpl<T, rounding_mode, scale_mode>,
                    IntegerRoundingImpl<T, rounding_mode, scale_mode, tie_breaking_mode>>>;

    /// Rounds the column `col_general` and returns a new column with the result.
    /// scale_arg > 0 rounds to that many decimal places, scale_arg < 0 to that many
    /// zeros before the decimal point, scale_arg == 0 to an integer.
    static ColumnPtr apply(const IColumn* col_general, Int16 scale_arg) {
        if constexpr (IsNumber<T>) {
            const auto* const src_column = check_and_get_column<ColumnVector<T>>(col_general);
            auto dst_column = ColumnVector<T>::create();

            typename ColumnVector<T>::Container& dst = dst_column->get_data();
            const auto& src = src_column->get_data();
            dst.resize(src.size());

            // Nothing to compute for an empty column.
            if (dst.empty()) {
                return dst_column;
            }

            if (scale_arg > 0) {
                const size_t scale = int_exp10(scale_arg);
                FunctionRoundingImpl<ScaleMode::Positive>::apply(src, scale, dst);
            } else if (scale_arg < 0) {
                const size_t scale = int_exp10(-scale_arg);
                FunctionRoundingImpl<ScaleMode::Negative>::apply(src, scale, dst);
            } else {
                const size_t scale = 1;
                FunctionRoundingImpl<ScaleMode::Zero>::apply(src, scale, dst);
            }

            return dst_column;
        } else if constexpr (IsDecimalNumber<T>) {
            const auto* const src_column = check_and_get_column<ColumnDecimal<T>>(col_general);
            const auto& src = src_column->get_data();

            // The result keeps the scale of the source column.
            auto dst_column = ColumnDecimal<T>::create(src.size(), src_column->get_scale());
            auto& dst = dst_column->get_data();

            if (dst.empty()) {
                return dst_column;
            }

            FunctionRoundingImpl<ScaleMode::Negative>::apply(src_column->get_data(),
                                                             src_column->get_scale(), dst,
                                                             scale_arg);
            return dst_column;
        } else {
            __builtin_unreachable();
            return nullptr;
        }
    }
};

/** Rounding function with one or two arguments: the column to round and an optional
  * constant number of digits. The result has the type of the first argument.
  */
template <typename Impl, RoundingMode rounding_mode, TieBreakingMode tie_breaking_mode>
class FunctionRounding : public IFunction {
public:
    static constexpr auto name = Impl::name;
    static FunctionPtr create() { return std::make_shared<FunctionRounding>(); }

    String get_name() const override { return name; }

    bool is_variadic() const override { return true; }
    size_t get_number_of_arguments() const override { return 0; }

    DataTypes get_variadic_argument_types_impl() const override {
        return Impl::get_variadic_argument_types();
    }

    /// Get result type by argument types: the type of the first argument.
    /// A wrong number of arguments is reported with LOG(FATAL).
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
        const size_t argument_count = arguments.size();
        if (argument_count == 0 || argument_count > 2) {
            LOG(FATAL) << "Number of arguments for function " + get_name() +
                                  " doesn't match: should be 1 or 2. ";
        }

        return arguments[0];
    }

    /// Extracts the scale from the second argument.
    /// The column must be constant and its value must fit into Int16; otherwise
    /// InvalidArgument is returned and `*scale` is left untouched.
    static Status get_scale_arg(const ColumnWithTypeAndName& arguments, Int16* scale) {
        const IColumn& scale_column = *arguments.column;
        if (!is_column_const(scale_column)) {
            return Status::InvalidArgument("2nd argument for function {} should be constant", name);
        }

        Field scale_field = assert_cast<const ColumnConst&>(scale_column).get_field();
        Int64 scale64 = scale_field.get<Int64>();

        constexpr Int64 lowest_scale = std::numeric_limits<Int16>::min();
        constexpr Int64 highest_scale = std::numeric_limits<Int16>::max();
        const bool out_of_range = scale64 < lowest_scale || scale64 > highest_scale;
        if (out_of_range) {
            return Status::InvalidArgument("Scale argument for function {} is too large: {}", name,
                                           scale64);
        }

        *scale = scale64;
        return Status::OK();
    }

    bool use_default_implementation_for_constants() const override { return true; }
    ColumnNumbers get_arguments_that_are_always_constant() const override { return {1}; }

    /// Rounds the first argument column and puts the result at position `result`.
    /// Without a second argument the scale is 0.
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        size_t result, size_t /*input_rows_count*/) override {
        const ColumnWithTypeAndName& source = block.get_by_position(arguments[0]);

        Int16 scale_arg = 0;
        if (arguments.size() == 2) {
            RETURN_IF_ERROR(get_scale_arg(block.get_by_position(arguments[1]), &scale_arg));
        }

#if !defined(__SSE4_1__)
        /// The scalar implementation uses "nearbyint", which follows the current floating
        /// point rounding mode. Round-to-nearest is needed for the Banker's rounding.
        /// It is the default, but it is set explicitly just in case.

        if constexpr (rounding_mode == RoundingMode::Round) {
            if (0 != fesetround(FE_TONEAREST)) {
                return Status::InvalidArgument("Cannot set floating point rounding mode");
            }
        }
#endif

        ColumnPtr rounded;
        // Called with the data type that matches the column's type id; handles number
        // and decimal types and rejects everything else.
        auto round_column = [&](const auto& types) -> bool {
            using Types = std::decay_t<decltype(types)>;
            using DataType = typename Types::LeftType;

            if constexpr (IsDataTypeNumber<DataType> || IsDataTypeDecimal<DataType>) {
                using FieldType = typename DataType::FieldType;
                rounded = Dispatcher<FieldType, rounding_mode, tie_breaking_mode>::apply(
                        source.column.get(), scale_arg);
                return true;
            }
            return false;
        };

        const bool handled =
                call_on_index_and_data_type<void>(source.type->get_type_id(), round_column);
        if (!handled) {
            return Status::InvalidArgument("Invalid argument type {} for function {}",
                                           source.type->get_name(), name);
        }

        block.replace_by_position(result, std::move(rounded));
        return Status::OK();
    }
};

} // namespace doris::vectorized