[fix](decimal) Fix long string casting to decimalv2 (#35121)

2024-05-22 14:24:22 +08:00
parent 84f7bfffe2
commit c23384ff07
8 changed files with 124 additions and 194 deletions
--- a/be/src/util/string_parser.hpp
+++ b/be/src/util/string_parser.hpp
@ -594,123 +594,74 @@ T StringParser::string_to_decimal(const char* __restrict s, int len, int type_pr
    bool found_exponent = false;
    int8_t exponent = 0;
    T value = 0;
-    if constexpr (TYPE_DECIMALV2 == P) {
-        // decimalv2 do not care type_scale and type_precision,just keep the origin logic
-        for (int i = 0; i < len; ++i) {
-            const char& c = s[i];
-            if (LIKELY('0' <= c && c <= '9')) {
-                found_value = true;
-                // Ignore digits once the type's precision limit is reached. This avoids
-                // overflowing the underlying storage while handling a string like
-                // 10000000000e-10 into a DECIMAL(1, 0). Adjustments for ignored digits and
-                // an exponent will be made later.
-                if (LIKELY(type_precision > precision)) {
-                    value = (value * 10) + (c - '0'); // Benchmarks are faster with parenthesis...
-                } else {
-                    *result = StringParser::PARSE_OVERFLOW;
-                    value = is_negative
-                                    ? vectorized::min_decimal_value<DecimalType>(type_precision)
-                                    : vectorized::max_decimal_value<DecimalType>(type_precision);
-                    return value;
-                }
-                DCHECK(value >= 0); // For some reason //DCHECK_GE doesn't work with __int128.
+    bool has_round = false;
+    for (int i = 0; i < len; ++i) {
+        const char& c = s[i];
+        if (LIKELY('0' <= c && c <= '9')) {
+            found_value = true;
+            // Ignore digits once the type's precision limit is reached. This avoids
+            // overflowing the underlying storage while handling a string like
+            // 10000000000e-10 into a DECIMAL(1, 0). Adjustments for ignored digits and
+            // an exponent will be made later.
+            if (LIKELY(type_precision > precision) && !has_round) {
+                value = (value * 10) + (c - '0'); // Benchmarks are faster with parenthesis...
                ++precision;
                scale += found_dot;
-            } else if (c == '.' && LIKELY(!found_dot)) {
-                found_dot = 1;
-            } else if ((c == 'e' || c == 'E') && LIKELY(!found_exponent)) {
-                found_exponent = true;
-                exponent = string_to_int_internal<int8_t>(s + i + 1, len - i - 1, result);
-                if (UNLIKELY(*result != StringParser::PARSE_SUCCESS)) {
-                    if (*result == StringParser::PARSE_OVERFLOW && exponent < 0) {
-                        *result = StringParser::PARSE_UNDERFLOW;
-                    }
-                    return 0;
-                }
-                break;
-            } else {
-                if (value == 0) {
-                    *result = StringParser::PARSE_FAILURE;
-                    return 0;
-                }
-                *result = StringParser::PARSE_SUCCESS;
-                value *= get_scale_multiplier<T>(type_scale - scale);
-
-                return is_negative ? T(-value) : T(value);
-            }
-        }
-    } else {
-        // decimalv3
-        bool has_round = false;
-        for (int i = 0; i < len; ++i) {
-            const char& c = s[i];
-            if (LIKELY('0' <= c && c <= '9')) {
-                found_value = true;
-                // Ignore digits once the type's precision limit is reached. This avoids
-                // overflowing the underlying storage while handling a string like
-                // 10000000000e-10 into a DECIMAL(1, 0). Adjustments for ignored digits and
-                // an exponent will be made later.
-                if (LIKELY(type_precision > precision) && !has_round) {
-                    value = (value * 10) + (c - '0'); // Benchmarks are faster with parenthesis...
-                    ++precision;
-                    scale += found_dot;
-                    cur_digit = precision - scale;
-                } else if (!found_dot && max_digit < (precision - scale)) {
-                    *result = StringParser::PARSE_OVERFLOW;
-                    value = is_negative
-                                    ? vectorized::min_decimal_value<DecimalType>(type_precision)
+                cur_digit = precision - scale;
+            } else if (!found_dot && max_digit < (precision - scale)) {
+                *result = StringParser::PARSE_OVERFLOW;
+                value = is_negative ? vectorized::min_decimal_value<DecimalType>(type_precision)
                                    : vectorized::max_decimal_value<DecimalType>(type_precision);
-                    return value;
-                } else if (found_dot && scale >= type_scale && !has_round) {
-                    // make rounding cases
-                    if (c > '4') {
-                        value += 1;
-                    }
-                    has_round = true;
-                    continue;
-                } else if (!found_dot) {
-                    ++cur_digit;
+                return value;
+            } else if (found_dot && scale >= type_scale && !has_round) {
+                // make rounding cases
+                if (c > '4') {
+                    value += 1;
                }
-                DCHECK(value >= 0); // For some reason //DCHECK_GE doesn't work with __int128.
-            } else if (c == '.' && LIKELY(!found_dot)) {
-                found_dot = 1;
-            } else if ((c == 'e' || c == 'E') && LIKELY(!found_exponent)) {
-                found_exponent = true;
-                exponent = string_to_int_internal<int8_t>(s + i + 1, len - i - 1, result);
-                if (UNLIKELY(*result != StringParser::PARSE_SUCCESS)) {
-                    if (*result == StringParser::PARSE_OVERFLOW && exponent < 0) {
-                        *result = StringParser::PARSE_UNDERFLOW;
-                    }
-                    return 0;
-                }
-                break;
-            } else {
-                if (value == 0) {
-                    *result = StringParser::PARSE_FAILURE;
-                    return 0;
-                }
-                // here to handle
-                *result = StringParser::PARSE_SUCCESS;
-                if (type_scale >= scale) {
-                    value *= get_scale_multiplier<T>(type_scale - scale);
-                    // here meet non-valid character, should return the value, keep going to meet
-                    // the E/e character because we make right user-given type_precision
-                    // not max number type_precision
-                    if (!is_numeric_ascii(c)) {
-                        if (cur_digit > type_precision) {
-                            *result = StringParser::PARSE_OVERFLOW;
-                            value = is_negative ? vectorized::min_decimal_value<DecimalType>(
-                                                          type_precision)
-                                                : vectorized::max_decimal_value<DecimalType>(
-                                                          type_precision);
-                            return value;
-                        }
-                        return is_negative ? T(-value) : T(value);
-                    }
-                }
-
-                return is_negative ? T(-value) : T(value);
+                has_round = true;
+                continue;
+            } else if (!found_dot) {
+                ++cur_digit;
            }
+            DCHECK(value >= 0); // For some reason //DCHECK_GE doesn't work with __int128.
+        } else if (c == '.' && LIKELY(!found_dot)) {
+            found_dot = 1;
+        } else if ((c == 'e' || c == 'E') && LIKELY(!found_exponent)) {
+            found_exponent = true;
+            exponent = string_to_int_internal<int8_t>(s + i + 1, len - i - 1, result);
+            if (UNLIKELY(*result != StringParser::PARSE_SUCCESS)) {
+                if (*result == StringParser::PARSE_OVERFLOW && exponent < 0) {
+                    *result = StringParser::PARSE_UNDERFLOW;
+                }
+                return 0;
+            }
+            break;
+        } else {
+            if (value == 0) {
+                *result = StringParser::PARSE_FAILURE;
+                return 0;
+            }
+            // here to handle
+            *result = StringParser::PARSE_SUCCESS;
+            if (type_scale >= scale) {
+                value *= get_scale_multiplier<T>(type_scale - scale);
+                // here meet non-valid character, should return the value, keep going to meet
+                // the E/e character because we make right user-given type_precision
+                // not max number type_precision
+                if (!is_numeric_ascii(c)) {
+                    if (cur_digit > type_precision) {
+                        *result = StringParser::PARSE_OVERFLOW;
+                        value = is_negative
+                                        ? vectorized::min_decimal_value<DecimalType>(type_precision)
+                                        : vectorized::max_decimal_value<DecimalType>(
+                                                  type_precision);
+                        return value;
+                    }
+                    return is_negative ? T(-value) : T(value);
+                }
+            }
+
+            return is_negative ? T(-value) : T(value);
        }
    }

--- a/be/test/vec/data_types/from_string_test.cpp
+++ b/be/test/vec/data_types/from_string_test.cpp
@ -103,7 +103,7 @@ TEST(FromStringTest, ScalaWrapperFieldVsDataType) {
                         "12345678901234567.012345677", "12345678901234567.012345677",
                         "999999999999999999.999999999"},
                        {"12345678901234567.012345678", "123456789012345678.012345670",
-                         "12345678901234567.012345678", "", ""}),
+                         "12345678901234567.012345678", "12345678901234567.012345678", ""}),
                // decimal32 ==>  decimal32(9,2)
                FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL32,
                                  // (7,2)         (6,3)         (7,3)           (8,1)
--- a/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_csv_test.cpp
@ -74,21 +74,22 @@ TEST(CsvSerde, ScalaDataTypeSerdeCsvTest) {
                FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_STRING, {"doris be better"},
                                  {"doris be better"}),
                // decimal ==> decimalv2(decimal<128>(27,9))
-                FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL,
-                                  {
-                                          // (17, 9)(first 0 will ignore)
-                                          "012345678901234567.012345678",
-                                          // (18, 8) (automatically fill 0 for scala)
-                                          "123456789012345678.01234567",
-                                          // (17, 10) (rounding last to make it fit)
-                                          "12345678901234567.0123456779",
-                                          // (17, 11) (rounding last to make it fit)
-                                          "12345678901234567.01234567791",
-                                          // (19, 8) (wrong)
-                                          "1234567890123456789.01234567",
-                                  },
-                                  {"12345678901234567.012345678", "123456789012345678.012345670",
-                                   "12345678901234567.012345678", "", ""}),
+                FieldType_RandStr(
+                        FieldType::OLAP_FIELD_TYPE_DECIMAL,
+                        {
+                                // (17, 9)(first 0 will ignore)
+                                "012345678901234567.012345678",
+                                // (18, 8) (automatically fill 0 for scala)
+                                "123456789012345678.01234567",
+                                // (17, 10) (rounding last to make it fit)
+                                "12345678901234567.0123456779",
+                                // (17, 11) (rounding last to make it fit)
+                                "12345678901234567.01234567791",
+                                // (19, 8) (wrong)
+                                "1234567890123456789.01234567",
+                        },
+                        {"12345678901234567.012345678", "123456789012345678.012345670",
+                         "12345678901234567.012345678", "12345678901234567.012345678", ""}),
                // decimal32 ==>  decimal32(9,2)                       (7,2)         (6,3)         (7,3)           (8,1)
                FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL32,
                                  {"1234567.12", "123456.123", "1234567.123", "12345679.1"},
--- a/be/test/vec/data_types/serde/data_type_serde_text_test.cpp
+++ b/be/test/vec/data_types/serde/data_type_serde_text_test.cpp
@ -74,21 +74,22 @@ TEST(TextSerde, ScalaDataTypeSerdeTextTest) {
                FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_STRING, {"doris be better"},
                                  {"doris be better"}),
                // decimal ==> decimalv2(decimal<128>(27,9))
-                FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL,
-                                  {
-                                          // (17, 9)(first 0 will ignore)
-                                          "012345678901234567.012345678",
-                                          // (18, 8) (automatically fill 0 for scala)
-                                          "123456789012345678.01234567",
-                                          // (17, 10) (rounding last to make it fit)
-                                          "12345678901234567.0123456779",
-                                          // (17, 11) (rounding last to make it fit)
-                                          "12345678901234567.01234567791",
-                                          // (19, 8) (wrong)
-                                          "1234567890123456789.01234567",
-                                  },
-                                  {"12345678901234567.012345678", "123456789012345678.012345670",
-                                   "12345678901234567.012345678", "", ""}),
+                FieldType_RandStr(
+                        FieldType::OLAP_FIELD_TYPE_DECIMAL,
+                        {
+                                // (17, 9)(first 0 will ignore)
+                                "012345678901234567.012345678",
+                                // (18, 8) (automatically fill 0 for scala)
+                                "123456789012345678.01234567",
+                                // (17, 10) (rounding last to make it fit)
+                                "12345678901234567.0123456779",
+                                // (17, 11) (rounding last to make it fit)
+                                "12345678901234567.01234567791",
+                                // (19, 8) (wrong)
+                                "1234567890123456789.01234567",
+                        },
+                        {"12345678901234567.012345678", "123456789012345678.012345670",
+                         "12345678901234567.012345678", "12345678901234567.012345678", ""}),
                // decimal32 ==>  decimal32(9,2)                       (7,2)         (6,3)         (7,3)           (8,1)
                FieldType_RandStr(FieldType::OLAP_FIELD_TYPE_DECIMAL32,
                                  {"1234567.12", "123456.123", "1234567.123", "12345679.1"},
@ -429,13 +430,13 @@ TEST(TextSerde, ComplexTypeSerdeTextTest) {
                         "[\\1234567890123456789.01234567\\]"},
                        {"[4.000000000, 5.500000000, 6.670000000]",
                         "[12345678901234567.012345678, 123456789012345678.012345670, "
-                         "12345678901234567.012345678, null, null]",
+                         "12345678901234567.012345678, 12345678901234567.012345678, null]",
                         "[null, null, null, null, null]", "[null]"},
                        {"[4.000000000, 5.500000000, 6.670000000]",
                         "[12345678901234567.012345678, 123456789012345678.012345670, "
-                         "12345678901234567.012345678, null, null]",
+                         "12345678901234567.012345678, 12345678901234567.012345678, null]",
                         "[12345678901234567.012345678, 123456789012345678.012345670, "
-                         "12345678901234567.012345678, null, null]",
+                         "12345678901234567.012345678, 12345678901234567.012345678, null]",
                         "[null]"}),
        };
        // array type
--- a/regression-test/data/datatype_p0/decimalv2/test_decimalv2_load.out
+++ b/regression-test/data/datatype_p0/decimalv2/test_decimalv2_load.out
@ -15,11 +15,3 @@
 11.99990
 837.43444

-- !decimalv2_insert --
-999999999999999999.999999999	1.000000000
-999999999999999999.999999999	2.000000000
-999999999999999999.999999999	3.000000000
-999999999999999999.999999999	4.000000000
-999999999999999999.999999999	5.000000000
-999999999999999999.999999999	6.000000000
-
--- a/regression-test/data/datatype_p0/decimalv2/test_decimalv2_overflow2.out
+++ b/regression-test/data/datatype_p0/decimalv2/test_decimalv2_overflow2.out
@ -27,10 +27,10 @@
 999999999999999999.999999999

 -- !multi_overflow2 --
-999999999999999999.999999999	999999999999999999.999999999000000000
+999999999999999999.999999999	999999999999999999.999999999

 -- !multi_overflow3 --
-999999999999999999.999999999	999999999999999999.999999999000000000
+999999999999999999.999999999	999999999999999999.999999999

 -- !multi_overflow4 --
 999999999999999999.999999999	1.000000000	999999999999999999.999999999
@ -39,10 +39,10 @@
 99999999999999999.999999999	0.100000000	999999999999999999.999999990

 -- !div_overflow2 --
-999999999999999999.999999990
+999999999999999999.99999999

 -- !div_overflow3 --
-99999999999999999.999999999	0.1	999999999999999999.9999999900000
+99999999999999999.999999999	0.1	999999999999999999.999999990

 -- !div_overflow4 --
 999999999999999999.999999990
@ -59,3 +59,6 @@
 -- !mod4 --
 0.099999999

+-- !sql --
+2023-12-18T00:00	95357.10
+
--- a/regression-test/suites/datatype_p0/decimalv2/test_decimalv2_load.groovy
+++ b/regression-test/suites/datatype_p0/decimalv2/test_decimalv2_load.groovy
@ -84,42 +84,6 @@ suite("test_decimalv2_load", "nonConcurrent") {
        select * from ${tableName2} order by 1;
    """

-    sql """
-        drop table if exists test_decimalv2_insert;
-    """
-    sql """
-        CREATE TABLE `test_decimalv2_insert` (
-            `k1` decimalv2(27, 9) null,
-            `k2` decimalv2(27, 9) null
-        )
-        DISTRIBUTED BY HASH(`k1`) BUCKETS 10
-        PROPERTIES (
-        "replication_num" = "1"
-        );
-    """
-    sql "set enable_insert_strict=true;"
-    // overflow, max is inserted
-    sql """
-        insert into test_decimalv2_insert values("999999999999999999999999999999",1);
-    """
-    // underflow, min is inserted
-    sql """
-        insert into test_decimalv2_insert values("-999999999999999999999999999999",2);
-    """
-    sql """
-        insert into test_decimalv2_insert values("999999999999999999.9999999991",3);
-    """
-    sql """
-        insert into test_decimalv2_insert values("-999999999999999999.9999999991",4);
-    """
-    sql """
-        insert into test_decimalv2_insert values("999999999999999999.9999999995",5);
-    """
-    sql """
-        insert into test_decimalv2_insert values("-999999999999999999.9999999995",6);
-    """
-    qt_decimalv2_insert "select * from test_decimalv2_insert order by 2; "
-
    sql """
        admin set frontend config("enable_decimal_conversion" = "true");
    """
--- a/regression-test/suites/datatype_p0/decimalv2/test_decimalv2_overflow2.groovy
+++ b/regression-test/suites/datatype_p0/decimalv2/test_decimalv2_overflow2.groovy
@ -269,6 +269,24 @@ suite("test_decimalv2_overflow2") {
    """


+    sql """ drop TABLE if exists test_table """
+    sql """ CREATE TABLE `test_table` (
+            `day_date` datetime NULL COMMENT '',
+            `growth_money` decimalv2(18, 2) NULL COMMENT ''
+            ) ENGINE=OLAP
+            UNIQUE KEY(`day_date`)
+            COMMENT ''
+            DISTRIBUTED BY HASH(`day_date`) BUCKETS 4
+            PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1",
+            "is_being_synced" = "false",
+            "storage_format" = "V2",
+            "disable_auto_compaction" = "false",
+            "enable_single_replica_compaction" = "false"
+            ); """
+    sql """ insert into test_table values ('2023-12-18', '95357.100000000000000000000000000000000000')"""
+    qt_sql """ select * from test_table """
+    sql """ drop TABLE if exists test_table """
    // TODO
    // decimalv2 +-*/ integer
    // integer +-*/ decimalv2