[Enhance] use fast_float::from_chars to do str cast to float/double to avoid lose precision (#16190)

2023-02-01 23:53:34 +08:00
parent 40d9e19e1d
commit 7c145faa80
2 changed files with 40 additions and 109 deletions
--- a/be/src/util/string_parser.hpp
+++ b/be/src/util/string_parser.hpp
@ -20,6 +20,8 @@

 #pragma once

+#include <fast_float/fast_float.h>
+
 #include <cmath>
 #include <cstdint>
 #include <cstring>
@ -111,13 +113,7 @@ public:

    template <typename T>
    static inline T string_to_float(const char* s, int len, ParseResult* result) {
-        T ans = string_to_float_internal<T>(s, len, result);
-        if (LIKELY(*result == PARSE_SUCCESS)) {
-            return ans;
-        }
-
-        int i = skip_leading_whitespace(s, len);
-        return string_to_float_internal<T>(s + i, len - i, result);
+        return string_to_float_internal<T>(s, len, result);
    }

    // Parses a string for 'true' or 'false', case insensitive.
@ -425,118 +421,54 @@ inline T StringParser::string_to_int_no_overflow(const char* s, int len, ParseRe

 template <typename T>
 inline T StringParser::string_to_float_internal(const char* s, int len, ParseResult* result) {
-    if (UNLIKELY(len <= 0)) {
+    int i = 0;
+    // skip leading spaces
+    for (; i < len; ++i) {
+        if (!is_whitespace(s[i])) {
+            break;
+        }
+    }
+
+    // skip back spaces
+    int j = len - 1;
+    for (; j >= i; j--) {
+        if (!is_whitespace(s[j])) {
+            break;
+        }
+    }
+
+    // skip leading '+', from_chars can handle '-'
+    if (i < len && s[i] == '+') {
+        i++;
+    }
+    if (UNLIKELY(i > j)) {
        *result = PARSE_FAILURE;
        return 0;
    }

    // Use double here to not lose precision while accumulating the result
    double val = 0;
-    bool negative = false;
-    int i = 0;
-    double divide = 1;
-    bool decimal = false;
-    int64_t remainder = 0;
-    // The number of 'significant figures' we've encountered so far (i.e., digits excluding
-    // leading 0s). This technically shouldn't count trailing 0s either, but for us it
-    // doesn't matter if we count them based on the implementation below.
-    int sig_figs = 0;
+    auto res = fast_float::from_chars(s + i, s + j + 1, val);

-    switch (*s) {
-    case '-':
-        negative = true;
-    case '+':
-        i = 1;
-    }
-
-    int first = i;
-    for (; i < len; ++i) {
-        if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
-            if (s[i] != '0' || sig_figs > 0) {
-                ++sig_figs;
-            }
-            if (decimal) {
-                // According to the IEEE floating-point spec, a double has up to 15-17
-                // significant decimal digits (see
-                // http://en.wikipedia.org/wiki/Double-precision_floating-point_format). We stop
-                // processing digits after we've already seen at least 18 sig figs to avoid
-                // overflowing 'remainder' (we stop after 18 instead of 17 to get the rounding
-                // right).
-                if (sig_figs <= 18) {
-                    remainder = remainder * 10 + s[i] - '0';
-                    divide *= 10;
+    if (res.ec == std::errc() && res.ptr == s + j + 1) {
+        if (abs(val) == std::numeric_limits<T>::infinity()) {
+            auto contain_inf = false;
+            for (int k = i; k < j + 1; k++) {
+                if (s[k] == 'i' || s[k] == 'I') {
+                    contain_inf = true;
+                    break;
                }
-            } else {
-                val = val * 10 + s[i] - '0';
-            }
-        } else if (s[i] == '.') {
-            decimal = true;
-        } else if (s[i] == 'e' || s[i] == 'E') {
-            break;
-        } else if (s[i] == 'i' || s[i] == 'I') {
-            if (len > i + 2 && (s[i + 1] == 'n' || s[i + 1] == 'N') &&
-                (s[i + 2] == 'f' || s[i + 2] == 'F')) {
-                // Note: Hive writes inf as Infinity, at least for text. We'll be a little loose
-                // here and interpret any column with inf as a prefix as infinity rather than
-                // checking every remaining byte.
-                *result = PARSE_SUCCESS;
-                return negative ? -INFINITY : INFINITY;
-            } else {
-                // Starts with 'i', but isn't inf...
-                *result = PARSE_FAILURE;
-                return 0;
-            }
-        } else if (s[i] == 'n' || s[i] == 'N') {
-            if (len > i + 2 && (s[i + 1] == 'a' || s[i + 1] == 'A') &&
-                (s[i + 2] == 'n' || s[i + 2] == 'N')) {
-                *result = PARSE_SUCCESS;
-                return negative ? -NAN : NAN;
-            } else {
-                // Starts with 'n', but isn't NaN...
-                *result = PARSE_FAILURE;
-                return 0;
            }
+
+            *result = contain_inf ? PARSE_SUCCESS : PARSE_OVERFLOW;
        } else {
-            if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) {
-                // Reject the string because either the first char was not a digit, "," or "e",
-                // or the remaining chars are not all whitespace
-                *result = PARSE_FAILURE;
-                return 0;
-            }
-            // skip trailing whitespace.
-            break;
+            *result = PARSE_SUCCESS;
        }
-    }
-
-    val += remainder / divide;
-
-    if (i < len && (s[i] == 'e' || s[i] == 'E')) {
-        // Create a C-string from s starting after the optional '-' sign and fall back to
-        // strtod to avoid conversion inaccuracy for scientific notation.
-        // Do not use boost::lexical_cast because it causes codegen to crash for an
-        // unknown reason (exception handling?).
-        char c_str[len - negative + 1];
-        memcpy(c_str, s + negative, len - negative);
-        c_str[len - negative] = '\0';
-        char* s_end;
-        val = strtod(c_str, &s_end);
-        if (s_end != c_str + len - negative) {
-            // skip trailing whitespace
-            int trailing_len = len - negative - (int)(s_end - c_str);
-            if (UNLIKELY(!is_all_whitespace(s_end, trailing_len))) {
-                *result = PARSE_FAILURE;
-                return val;
-            }
-        }
-    }
-
-    // Determine if it is an overflow case and update the result
-    if (UNLIKELY(val == std::numeric_limits<T>::infinity())) {
-        *result = PARSE_OVERFLOW;
+        return val;
    } else {
-        *result = PARSE_SUCCESS;
+        *result = PARSE_FAILURE;
    }
-    return (T)(negative ? -val : val);
+    return 0;
 }

 inline bool StringParser::string_to_bool_internal(const char* s, int len, ParseResult* result) {
--- a/be/test/util/string_parser_test.cpp
+++ b/be/test/util/string_parser_test.cpp
@ -441,7 +441,6 @@ TEST(StringToFloat, Basic) {
    std::string double_max = boost::lexical_cast<std::string>(std::numeric_limits<double>::max());
    test_float_value<double>(double_min, StringParser::PARSE_SUCCESS);
    test_float_value<double>(double_max, StringParser::PARSE_SUCCESS);
-
    // Non-finite values
    test_all_float_variants("INFinity", StringParser::PARSE_SUCCESS);
    test_all_float_variants("infinity", StringParser::PARSE_SUCCESS);
@ -451,8 +450,8 @@ TEST(StringToFloat, Basic) {
    test_float_value_is_nan<double>("nan", StringParser::PARSE_SUCCESS);
    test_float_value_is_nan<float>("NaN", StringParser::PARSE_SUCCESS);
    test_float_value_is_nan<double>("NaN", StringParser::PARSE_SUCCESS);
-    test_float_value_is_nan<float>("nana", StringParser::PARSE_SUCCESS);
-    test_float_value_is_nan<double>("nana", StringParser::PARSE_SUCCESS);
+    test_float_value_is_nan<float>("nana", StringParser::PARSE_FAILURE);
+    test_float_value_is_nan<double>("nana", StringParser::PARSE_FAILURE);
    test_float_value_is_nan<float>("naN", StringParser::PARSE_SUCCESS);
    test_float_value_is_nan<double>("naN", StringParser::PARSE_SUCCESS);