[Enhance] use fast_float::from_chars to do str cast to float/double to avoid lose precision (#16190)

This commit is contained in:
HappenLee
2023-02-01 23:53:34 +08:00
committed by GitHub
parent 40d9e19e1d
commit 7c145faa80
2 changed files with 40 additions and 109 deletions

View File

@ -20,6 +20,8 @@
#pragma once
#include <fast_float/fast_float.h>
#include <cmath>
#include <cstdint>
#include <cstring>
@ -111,13 +113,7 @@ public:
template <typename T>
static inline T string_to_float(const char* s, int len, ParseResult* result) {
T ans = string_to_float_internal<T>(s, len, result);
if (LIKELY(*result == PARSE_SUCCESS)) {
return ans;
}
int i = skip_leading_whitespace(s, len);
return string_to_float_internal<T>(s + i, len - i, result);
return string_to_float_internal<T>(s, len, result);
}
// Parses a string for 'true' or 'false', case insensitive.
@ -425,118 +421,54 @@ inline T StringParser::string_to_int_no_overflow(const char* s, int len, ParseRe
template <typename T>
inline T StringParser::string_to_float_internal(const char* s, int len, ParseResult* result) {
if (UNLIKELY(len <= 0)) {
int i = 0;
// skip leading spaces
for (; i < len; ++i) {
if (!is_whitespace(s[i])) {
break;
}
}
// skip back spaces
int j = len - 1;
for (; j >= i; j--) {
if (!is_whitespace(s[j])) {
break;
}
}
// skip leading '+', from_chars can handle '-'
if (i < len && s[i] == '+') {
i++;
}
if (UNLIKELY(i > j)) {
*result = PARSE_FAILURE;
return 0;
}
// Use double here to not lose precision while accumulating the result
double val = 0;
bool negative = false;
int i = 0;
double divide = 1;
bool decimal = false;
int64_t remainder = 0;
// The number of 'significant figures' we've encountered so far (i.e., digits excluding
// leading 0s). This technically shouldn't count trailing 0s either, but for us it
// doesn't matter if we count them based on the implementation below.
int sig_figs = 0;
auto res = fast_float::from_chars(s + i, s + j + 1, val);
switch (*s) {
case '-':
negative = true;
case '+':
i = 1;
}
int first = i;
for (; i < len; ++i) {
if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
if (s[i] != '0' || sig_figs > 0) {
++sig_figs;
}
if (decimal) {
// According to the IEEE floating-point spec, a double has up to 15-17
// significant decimal digits (see
// http://en.wikipedia.org/wiki/Double-precision_floating-point_format). We stop
// processing digits after we've already seen at least 18 sig figs to avoid
// overflowing 'remainder' (we stop after 18 instead of 17 to get the rounding
// right).
if (sig_figs <= 18) {
remainder = remainder * 10 + s[i] - '0';
divide *= 10;
if (res.ec == std::errc() && res.ptr == s + j + 1) {
if (abs(val) == std::numeric_limits<T>::infinity()) {
auto contain_inf = false;
for (int k = i; k < j + 1; k++) {
if (s[k] == 'i' || s[k] == 'I') {
contain_inf = true;
break;
}
} else {
val = val * 10 + s[i] - '0';
}
} else if (s[i] == '.') {
decimal = true;
} else if (s[i] == 'e' || s[i] == 'E') {
break;
} else if (s[i] == 'i' || s[i] == 'I') {
if (len > i + 2 && (s[i + 1] == 'n' || s[i + 1] == 'N') &&
(s[i + 2] == 'f' || s[i + 2] == 'F')) {
// Note: Hive writes inf as Infinity, at least for text. We'll be a little loose
// here and interpret any column with inf as a prefix as infinity rather than
// checking every remaining byte.
*result = PARSE_SUCCESS;
return negative ? -INFINITY : INFINITY;
} else {
// Starts with 'i', but isn't inf...
*result = PARSE_FAILURE;
return 0;
}
} else if (s[i] == 'n' || s[i] == 'N') {
if (len > i + 2 && (s[i + 1] == 'a' || s[i + 1] == 'A') &&
(s[i + 2] == 'n' || s[i + 2] == 'N')) {
*result = PARSE_SUCCESS;
return negative ? -NAN : NAN;
} else {
// Starts with 'n', but isn't NaN...
*result = PARSE_FAILURE;
return 0;
}
*result = contain_inf ? PARSE_SUCCESS : PARSE_OVERFLOW;
} else {
if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) {
// Reject the string because either the first char was not a digit, "," or "e",
// or the remaining chars are not all whitespace
*result = PARSE_FAILURE;
return 0;
}
// skip trailing whitespace.
break;
*result = PARSE_SUCCESS;
}
}
val += remainder / divide;
if (i < len && (s[i] == 'e' || s[i] == 'E')) {
// Create a C-string from s starting after the optional '-' sign and fall back to
// strtod to avoid conversion inaccuracy for scientific notation.
// Do not use boost::lexical_cast because it causes codegen to crash for an
// unknown reason (exception handling?).
char c_str[len - negative + 1];
memcpy(c_str, s + negative, len - negative);
c_str[len - negative] = '\0';
char* s_end;
val = strtod(c_str, &s_end);
if (s_end != c_str + len - negative) {
// skip trailing whitespace
int trailing_len = len - negative - (int)(s_end - c_str);
if (UNLIKELY(!is_all_whitespace(s_end, trailing_len))) {
*result = PARSE_FAILURE;
return val;
}
}
}
// Determine if it is an overflow case and update the result
if (UNLIKELY(val == std::numeric_limits<T>::infinity())) {
*result = PARSE_OVERFLOW;
return val;
} else {
*result = PARSE_SUCCESS;
*result = PARSE_FAILURE;
}
return (T)(negative ? -val : val);
return 0;
}
inline bool StringParser::string_to_bool_internal(const char* s, int len, ParseResult* result) {

View File

@ -441,7 +441,6 @@ TEST(StringToFloat, Basic) {
std::string double_max = boost::lexical_cast<std::string>(std::numeric_limits<double>::max());
test_float_value<double>(double_min, StringParser::PARSE_SUCCESS);
test_float_value<double>(double_max, StringParser::PARSE_SUCCESS);
// Non-finite values
test_all_float_variants("INFinity", StringParser::PARSE_SUCCESS);
test_all_float_variants("infinity", StringParser::PARSE_SUCCESS);
@ -451,8 +450,8 @@ TEST(StringToFloat, Basic) {
test_float_value_is_nan<double>("nan", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<float>("NaN", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<double>("NaN", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<float>("nana", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<double>("nana", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<float>("nana", StringParser::PARSE_FAILURE);
test_float_value_is_nan<double>("nana", StringParser::PARSE_FAILURE);
test_float_value_is_nan<float>("naN", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<double>("naN", StringParser::PARSE_SUCCESS);