// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // This file is copied from // https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/string-parser.hpp // and modified by Doris #pragma once #include #include #include #include // IWYU pragma: no_include #include // IWYU pragma: keep #include #include #include #include #include #include #include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "runtime/large_int_value.h" #include "runtime/primitive_type.h" #include "vec/common/int_exp.h" #include "vec/core/extended_types.h" #include "vec/core/wide_integer.h" #include "vec/data_types/data_type_decimal.h" #include "vec/data_types/number_traits.h" namespace doris { namespace vectorized { template struct Decimal; } // namespace vectorized // Utility functions for doing atoi/atof on non-null terminated strings. On micro benchmarks, // this is significantly faster than libc (atoi/strtol and atof/strtod). // // Strings with leading and trailing whitespaces are accepted. // Branching is heavily optimized for the non-whitespace successful case. // All the StringTo* functions first parse the input string assuming it has no leading whitespace. // If that first attempt was unsuccessful, these functions retry the parsing after removing // whitespace. Therefore, strings with whitespace take a perf hit on branch mis-prediction. // // For overflows, we are following the mysql behavior, to cap values at the max/min value for that // data type. This is different from hive, which returns NULL for overflow slots for int types // and inf/-inf for float types. // // Things we tried that did not work: // - lookup table for converting character to digit // Improvements (TODO): // - Validate input using _sidd_compare_ranges // - Since we know the length, we can parallelize this: i.e. result = 100*s[0] + 10*s[1] + s[2] class StringParser { public: enum ParseResult { PARSE_SUCCESS = 0, PARSE_FAILURE, PARSE_OVERFLOW, PARSE_UNDERFLOW }; template static T numeric_limits(bool negative) { if constexpr (std::is_same_v) { return negative ? MIN_INT128 : MAX_INT128; } else { return negative ? std::numeric_limits::min() : std::numeric_limits::max(); } } template static T get_scale_multiplier(int scale) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, "You can only instantiate as int32_t, int64_t, __int128."); if constexpr (std::is_same_v) { return common::exp10_i32(scale); } else if constexpr (std::is_same_v) { return common::exp10_i64(scale); } else if constexpr (std::is_same_v) { return common::exp10_i128(scale); } else if constexpr (std::is_same_v) { return common::exp10_i256(scale); } } // This is considerably faster than glibc's implementation (25x). // In the case of overflow, the max/min value for the data type will be returned. // Assumes s represents a decimal number. template static inline T string_to_int(const char* __restrict s, int len, ParseResult* result) { T ans = string_to_int_internal(s, len, result); if (LIKELY(*result == PARSE_SUCCESS)) { return ans; } int i = skip_leading_whitespace(s, len); return string_to_int_internal(s + i, len - i, result); } // This is considerably faster than glibc's implementation. // In the case of overflow, the max/min value for the data type will be returned. // Assumes s represents a decimal number. template static inline T string_to_unsigned_int(const char* __restrict s, int len, ParseResult* result) { T ans = string_to_unsigned_int_internal(s, len, result); if (LIKELY(*result == PARSE_SUCCESS)) { return ans; } int i = skip_leading_whitespace(s, len); return string_to_unsigned_int_internal(s + i, len - i, result); } // Convert a string s representing a number in given base into a decimal number. template static inline T string_to_int(const char* __restrict s, int len, int base, ParseResult* result) { T ans = string_to_int_internal(s, len, base, result); if (LIKELY(*result == PARSE_SUCCESS)) { return ans; } int i = skip_leading_whitespace(s, len); return string_to_int_internal(s + i, len - i, base, result); } template static inline T string_to_float(const char* __restrict s, int len, ParseResult* result) { return string_to_float_internal(s, len, result); } // Parses a string for 'true' or 'false', case insensitive. static inline bool string_to_bool(const char* __restrict s, int len, ParseResult* result) { bool ans = string_to_bool_internal(s, len, result); if (LIKELY(*result == PARSE_SUCCESS)) { return ans; } int i = skip_leading_whitespace(s, len); return string_to_bool_internal(s + i, len - i, result); } template ::CppType::NativeType, typename DecimalType = PrimitiveTypeTraits

::ColumnType::value_type> static inline T string_to_decimal(const char* __restrict s, int len, int type_precision, int type_scale, ParseResult* result); template static Status split_string_to_map(const std::string& base, const T element_separator, const T key_value_separator, std::map* result) { int key_pos = 0; int key_end; int val_pos; int val_end; while ((key_end = base.find(key_value_separator, key_pos)) != std::string::npos) { if ((val_pos = base.find_first_not_of(key_value_separator, key_end)) == std::string::npos) { break; } if ((val_end = base.find(element_separator, val_pos)) == std::string::npos) { val_end = base.size(); } result->insert(std::make_pair(base.substr(key_pos, key_end - key_pos), base.substr(val_pos, val_end - val_pos))); key_pos = val_end; if (key_pos != std::string::npos) { ++key_pos; } } return Status::OK(); } private: // This is considerably faster than glibc's implementation. // In the case of overflow, the max/min value for the data type will be returned. // Assumes s represents a decimal number. // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed. template static inline T string_to_int_internal(const char* __restrict s, int len, ParseResult* result); // This is considerably faster than glibc's implementation. // In the case of overflow, the max/min value for the data type will be returned. // Assumes s represents a decimal number. // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed. template static inline T string_to_unsigned_int_internal(const char* __restrict s, int len, ParseResult* result); // Convert a string s representing a number in given base into a decimal number. // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed. template static inline T string_to_int_internal(const char* __restrict s, int len, int base, ParseResult* result); // Converts an ascii string to an integer of type T assuming it cannot overflow // and the number is positive. // Leading whitespace is not allowed. Trailing whitespace will be skipped. template static inline T string_to_int_no_overflow(const char* __restrict s, int len, ParseResult* result); // This is considerably faster than glibc's implementation (>100x why???) // No special case handling needs to be done for overflows, the floating point spec // already does it and will cap the values to -inf/inf // To avoid inaccurate conversions this function falls back to strtod for // scientific notation. // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed. // TODO: Investigate using intrinsics to speed up the slow strtod path. template static inline T string_to_float_internal(const char* __restrict s, int len, ParseResult* result); // parses a string for 'true' or 'false', case insensitive // Return PARSE_FAILURE on leading whitespace. Trailing whitespace is allowed. static inline bool string_to_bool_internal(const char* __restrict s, int len, ParseResult* result); // Returns true if s only contains whitespace. static inline bool is_all_whitespace(const char* __restrict s, int len) { for (int i = 0; i < len; ++i) { if (!LIKELY(is_whitespace(s[i]))) { return false; } } return true; } // For strings like "3.0", "3.123", and "3.", can parse them as 3. static inline bool is_float_suffix(const char* __restrict s, int len) { return (s[0] == '.' && is_all_digit(s + 1, len - 1)); } static inline bool is_all_digit(const char* __restrict s, int len) { for (int i = 0; i < len; ++i) { if (!LIKELY(s[i] >= '0' && s[i] <= '9')) { return false; } } return true; } // Returns the position of the first non-whitespace character in s. static inline int skip_leading_whitespace(const char* __restrict s, int len) { int i = 0; while (i < len && is_whitespace(s[i])) { ++i; } return i; } // Our own definition of "isspace" that optimize on the ' ' branch. static inline bool is_whitespace(const char& c) { return LIKELY(c == ' ') || UNLIKELY(c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'); } }; // end of class StringParser template T StringParser::string_to_int_internal(const char* __restrict s, int len, ParseResult* result) { if (UNLIKELY(len <= 0)) { *result = PARSE_FAILURE; return 0; } typedef typename std::make_unsigned::type UnsignedT; UnsignedT val = 0; UnsignedT max_val = StringParser::numeric_limits(false); bool negative = false; int i = 0; switch (*s) { case '-': negative = true; max_val += 1; [[fallthrough]]; case '+': ++i; // only one '+'/'-' char, so could return failure directly if (UNLIKELY(len == 1)) { *result = PARSE_FAILURE; return 0; } } // This is the fast path where the string cannot overflow. if (LIKELY(len - i < vectorized::NumberTraits::max_ascii_len())) { val = string_to_int_no_overflow(s + i, len - i, result); return static_cast(negative ? -val : val); } const T max_div_10 = max_val / 10; const T max_mod_10 = max_val % 10; int first = i; for (; i < len; ++i) { if (LIKELY(s[i] >= '0' && s[i] <= '9')) { T digit = s[i] - '0'; // This is a tricky check to see if adding this digit will cause an overflow. if (UNLIKELY(val > (max_div_10 - (digit > max_mod_10)))) { *result = PARSE_OVERFLOW; return negative ? -max_val : max_val; } val = val * 10 + digit; } else { if ((UNLIKELY(i == first || (!is_all_whitespace(s + i, len - i) && !is_float_suffix(s + i, len - i))))) { // Reject the string because either the first char was not a digit, // or the remaining chars are not all whitespace *result = PARSE_FAILURE; return 0; } // Returning here is slightly faster than breaking the loop. *result = PARSE_SUCCESS; return static_cast(negative ? -val : val); } } *result = PARSE_SUCCESS; return static_cast(negative ? -val : val); } template T StringParser::string_to_unsigned_int_internal(const char* __restrict s, int len, ParseResult* result) { if (UNLIKELY(len <= 0)) { *result = PARSE_FAILURE; return 0; } T val = 0; T max_val = std::numeric_limits::max(); int i = 0; typedef typename std::make_signed::type signedT; // This is the fast path where the string cannot overflow. if (LIKELY(len - i < vectorized::NumberTraits::max_ascii_len())) { val = string_to_int_no_overflow(s + i, len - i, result); return val; } const T max_div_10 = max_val / 10; const T max_mod_10 = max_val % 10; int first = i; for (; i < len; ++i) { if (LIKELY(s[i] >= '0' && s[i] <= '9')) { T digit = s[i] - '0'; // This is a tricky check to see if adding this digit will cause an overflow. if (UNLIKELY(val > (max_div_10 - (digit > max_mod_10)))) { *result = PARSE_OVERFLOW; return max_val; } val = val * 10 + digit; } else { if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) { // Reject the string because either the first char was not a digit, // or the remaining chars are not all whitespace *result = PARSE_FAILURE; return 0; } // Returning here is slightly faster than breaking the loop. *result = PARSE_SUCCESS; return val; } } *result = PARSE_SUCCESS; return val; } template T StringParser::string_to_int_internal(const char* __restrict s, int len, int base, ParseResult* result) { typedef typename std::make_unsigned::type UnsignedT; UnsignedT val = 0; UnsignedT max_val = StringParser::numeric_limits(false); bool negative = false; if (UNLIKELY(len <= 0)) { *result = PARSE_FAILURE; return 0; } int i = 0; switch (*s) { case '-': negative = true; max_val = StringParser::numeric_limits(false) + 1; [[fallthrough]]; case '+': i = 1; } const T max_div_base = max_val / base; const T max_mod_base = max_val % base; int first = i; for (; i < len; ++i) { T digit; if (LIKELY(s[i] >= '0' && s[i] <= '9')) { digit = s[i] - '0'; } else if (s[i] >= 'a' && s[i] <= 'z') { digit = (s[i] - 'a' + 10); } else if (s[i] >= 'A' && s[i] <= 'Z') { digit = (s[i] - 'A' + 10); } else { if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) { // Reject the string because either the first char was not an alpha/digit, // or the remaining chars are not all whitespace *result = PARSE_FAILURE; return 0; } // skip trailing whitespace. break; } // Bail, if we encounter a digit that is not available in base. if (digit >= base) { break; } // This is a tricky check to see if adding this digit will cause an overflow. if (UNLIKELY(val > (max_div_base - (digit > max_mod_base)))) { *result = PARSE_OVERFLOW; return static_cast(negative ? -max_val : max_val); } val = val * base + digit; } *result = PARSE_SUCCESS; return static_cast(negative ? -val : val); } template T StringParser::string_to_int_no_overflow(const char* __restrict s, int len, ParseResult* result) { T val = 0; if (UNLIKELY(len == 0)) { *result = PARSE_SUCCESS; return val; } // Factor out the first char for error handling speeds up the loop. if (LIKELY(s[0] >= '0' && s[0] <= '9')) { val = s[0] - '0'; } else { *result = PARSE_FAILURE; return 0; } for (int i = 1; i < len; ++i) { if (LIKELY(s[i] >= '0' && s[i] <= '9')) { T digit = s[i] - '0'; val = val * 10 + digit; } else { if ((UNLIKELY(!is_all_whitespace(s + i, len - i) && !is_float_suffix(s + i, len - i)))) { *result = PARSE_FAILURE; return 0; } *result = PARSE_SUCCESS; return val; } } *result = PARSE_SUCCESS; return val; } template T StringParser::string_to_float_internal(const char* __restrict s, int len, ParseResult* result) { int i = 0; // skip leading spaces for (; i < len; ++i) { if (!is_whitespace(s[i])) { break; } } // skip back spaces int j = len - 1; for (; j >= i; j--) { if (!is_whitespace(s[j])) { break; } } // skip leading '+', from_chars can handle '-' if (i < len && s[i] == '+') { i++; } if (UNLIKELY(i > j)) { *result = PARSE_FAILURE; return 0; } // Use double here to not lose precision while accumulating the result double val = 0; auto res = fast_float::from_chars(s + i, s + j + 1, val); if (res.ec == std::errc() && res.ptr == s + j + 1) { if (abs(val) == std::numeric_limits::infinity()) { auto contain_inf = false; for (int k = i; k < j + 1; k++) { if (s[k] == 'i' || s[k] == 'I') { contain_inf = true; break; } } *result = contain_inf ? PARSE_SUCCESS : PARSE_OVERFLOW; } else { *result = PARSE_SUCCESS; } return val; } else { *result = PARSE_FAILURE; } return 0; } inline bool StringParser::string_to_bool_internal(const char* __restrict s, int len, ParseResult* result) { *result = PARSE_SUCCESS; if (len >= 4 && (s[0] == 't' || s[0] == 'T')) { bool match = (s[1] == 'r' || s[1] == 'R') && (s[2] == 'u' || s[2] == 'U') && (s[3] == 'e' || s[3] == 'E'); if (match && LIKELY(is_all_whitespace(s + 4, len - 4))) { return true; } } else if (len >= 5 && (s[0] == 'f' || s[0] == 'F')) { bool match = (s[1] == 'a' || s[1] == 'A') && (s[2] == 'l' || s[2] == 'L') && (s[3] == 's' || s[3] == 'S') && (s[4] == 'e' || s[4] == 'E'); if (match && LIKELY(is_all_whitespace(s + 5, len - 5))) { return false; } } *result = PARSE_FAILURE; return false; } template T StringParser::string_to_decimal(const char* __restrict s, int len, int type_precision, int type_scale, ParseResult* result) { static_assert(std::is_same_v || std::is_same_v || std::is_same_v || std::is_same_v, "Cast string to decimal only support target type int32_t, int64_t, __int128 or " "wide::Int256."); // Special cases: // 1) '' == Fail, an empty string fails to parse. // 2) ' # ' == #, leading and trailing white space is ignored. // 3) '.' == 0, a single dot parses as zero (for consistency with other types). // 4) '#.' == '#', a trailing dot is ignored. // Ignore leading and trailing spaces. while (len > 0 && is_whitespace(*s)) { ++s; --len; } while (len > 0 && is_whitespace(s[len - 1])) { --len; } bool is_negative = false; if (len > 0) { switch (*s) { case '-': is_negative = true; [[fallthrough]]; case '+': ++s; --len; } } // Ignore leading zeros. bool found_value = false; while (len > 0 && UNLIKELY(*s == '0')) { found_value = true; ++s; --len; } // Ignore leading zeros even after a dot. This allows for differentiating between // cases like 0.01e2, which would fit in a DECIMAL(1, 0), and 0.10e2, which would // overflow. int scale = 0; int found_dot = 0; if (len > 0 && *s == '.') { found_dot = 1; ++s; --len; while (len > 0 && UNLIKELY(*s == '0')) { found_value = true; ++scale; ++s; --len; } } int precision = 0; int max_digit = type_precision - type_scale; int cur_digit = 0; bool found_exponent = false; int8_t exponent = 0; T value = 0; bool has_round = false; for (int i = 0; i < len; ++i) { const char& c = s[i]; if (LIKELY('0' <= c && c <= '9')) { found_value = true; // Ignore digits once the type's precision limit is reached. This avoids // overflowing the underlying storage while handling a string like // 10000000000e-10 into a DECIMAL(1, 0). Adjustments for ignored digits and // an exponent will be made later. if (LIKELY(type_precision > precision) && !has_round) { value = (value * 10) + (c - '0'); // Benchmarks are faster with parenthesis... ++precision; scale += found_dot; cur_digit = precision - scale; } else if (!found_dot && max_digit < (precision - scale)) { *result = StringParser::PARSE_OVERFLOW; value = is_negative ? vectorized::min_decimal_value(type_precision) : vectorized::max_decimal_value(type_precision); return value; } else if (found_dot && scale >= type_scale && !has_round) { // make rounding cases if (c > '4') { value += 1; } has_round = true; continue; } else if (!found_dot) { ++cur_digit; } DCHECK(value >= 0); // For some reason //DCHECK_GE doesn't work with __int128. } else if (c == '.' && LIKELY(!found_dot)) { found_dot = 1; } else if ((c == 'e' || c == 'E') && LIKELY(!found_exponent)) { found_exponent = true; exponent = string_to_int_internal(s + i + 1, len - i - 1, result); if (UNLIKELY(*result != StringParser::PARSE_SUCCESS)) { if (*result == StringParser::PARSE_OVERFLOW && exponent < 0) { *result = StringParser::PARSE_UNDERFLOW; } return 0; } break; } else { if (value == 0) { *result = StringParser::PARSE_FAILURE; return 0; } // here to handle *result = StringParser::PARSE_SUCCESS; if (type_scale >= scale) { value *= get_scale_multiplier(type_scale - scale); // here meet non-valid character, should return the value, keep going to meet // the E/e character because we make right user-given type_precision // not max number type_precision if (!is_numeric_ascii(c)) { if (cur_digit > type_precision) { *result = StringParser::PARSE_OVERFLOW; value = is_negative ? vectorized::min_decimal_value(type_precision) : vectorized::max_decimal_value( type_precision); return value; } return is_negative ? T(-value) : T(value); } } return is_negative ? T(-value) : T(value); } } // Find the number of truncated digits before adjusting the precision for an exponent. if (exponent > scale) { // Ex: 0.1e3 (which at this point would have precision == 1 and scale == 1), the // scale must be set to 0 and the value set to 100 which means a precision of 3. precision += exponent - scale; value *= get_scale_multiplier(exponent - scale); scale = 0; } else { // Ex: 100e-4, the scale must be set to 4 but no adjustment to the value is needed, // the precision must also be set to 4 but that will be done below for the // non-exponent case anyways. scale -= exponent; } // Ex: 0.001, at this point would have precision 1 and scale 3 since leading zeros // were ignored during previous parsing. if (scale > precision) { precision = scale; } // Microbenchmarks show that beyond this point, returning on parse failure is slower // than just letting the function run out. *result = StringParser::PARSE_SUCCESS; if (UNLIKELY(precision - scale > type_precision - type_scale)) { *result = StringParser::PARSE_OVERFLOW; if constexpr (TYPE_DECIMALV2 != P) { // decimalv3 overflow will return max min value for type precision value = is_negative ? vectorized::min_decimal_value(type_precision) : vectorized::max_decimal_value(type_precision); return value; } } else if (UNLIKELY(scale > type_scale)) { *result = StringParser::PARSE_UNDERFLOW; int shift = scale - type_scale; T divisor = get_scale_multiplier(shift); if (UNLIKELY(divisor == std::numeric_limits::max())) { value = 0; } else { T remainder = value % divisor; value /= divisor; if ((remainder > 0 ? T(remainder) : T(-remainder)) >= (divisor >> 1)) { value += 1; } } DCHECK(value >= 0); // //DCHECK_GE doesn't work with __int128. } else if (UNLIKELY(!found_value && !found_dot)) { *result = StringParser::PARSE_FAILURE; } if (type_scale > scale) { value *= get_scale_multiplier(type_scale - scale); } return is_negative ? T(-value) : T(value); } } // end namespace doris