From 0b16938b7f4c6b0e8732a1eac1a55dc75f9d20cf Mon Sep 17 00:00:00 2001 From: seawinde <149132972+seawinde@users.noreply.github.com> Date: Fri, 12 Jan 2024 18:53:29 +0800 Subject: [PATCH] [Fix](Nereids) Fix datatype length wrong when string contains chinese (#29885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a varchar literal contains Chinese characters, the length of the varchar type should not be the character count of the literal; it should be the number of bytes the literal actually occupies. Chinese characters are encoded as Unicode, and a single Chinese character occupies at most 4 bytes. So when a varchar literal contains Chinese characters, we set the type length to 4 * the literal's character length. For example: > CREATE MATERIALIZED VIEW test_varchar_literal_mv > BUILD IMMEDIATE REFRESH AUTO ON MANUAL > DISTRIBUTED BY RANDOM BUCKETS 2 > PROPERTIES ('replication_num' = '1') > AS > select case when l_orderkey > 1 then "一二三四" else "五六七八" end as field_1 from lineitem; mysql> desc test_varchar_literal_mv; The definition of the materialized view is as follows: +---------+-------------+------+-------+---------+-------+ | Field | Type | Null | Key | Default | Extra | +---------+-------------+------+-------+---------+-------+ | field_1 | VARCHAR(16) | No | false | NULL | NONE | +---------+-------------+------+-------+---------+-------+ --- .../nereids/parser/LogicalPlanBuilder.java | 9 ++- .../literal/StringLikeLiteral.java | 5 +- .../org/apache/doris/nereids/util/Utils.java | 12 ++++ .../apache/doris/nereids/util/UtilsTest.java | 35 +++++++++++ .../data/mtmv_p0/test_build_mtmv.out | 3 + .../suites/mtmv_p0/test_build_mtmv.groovy | 59 ++++++++++++++++++- 6 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 fe/fe-core/src/test/java/org/apache/doris/nereids/util/UtilsTest.java diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java index fda2c70d79..647b65a641 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/parser/LogicalPlanBuilder.java @@ -25,6 +25,7 @@ import org.apache.doris.analysis.UserIdentity; import org.apache.doris.catalog.AggregateType; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.KeysType; +import org.apache.doris.catalog.ScalarType; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; import org.apache.doris.common.Pair; @@ -316,6 +317,7 @@ import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.literal.MapLiteral; import org.apache.doris.nereids.trees.expressions.literal.NullLiteral; import org.apache.doris.nereids.trees.expressions.literal.SmallIntLiteral; +import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral; import org.apache.doris.nereids.trees.expressions.literal.StringLiteral; import org.apache.doris.nereids.trees.expressions.literal.StructLiteral; import org.apache.doris.nereids.trees.expressions.literal.TinyIntLiteral; @@ -406,6 +408,7 @@ import org.apache.doris.nereids.types.VarcharType; import org.apache.doris.nereids.types.coercion.CharacterType; import org.apache.doris.nereids.util.ExpressionUtils; import org.apache.doris.nereids.util.RelationUtil; +import org.apache.doris.nereids.util.Utils; import org.apache.doris.policy.FilterType; import org.apache.doris.policy.PolicyTypeEnum; import org.apache.doris.qe.ConnectContext; @@ -2104,7 +2107,11 @@ public class LogicalPlanBuilder extends DorisParserBaseVisitor { if (!SqlModeHelper.hasNoBackSlashEscapes()) { s = LogicalPlanBuilderAssistant.escapeBackSlash(s); } - return new VarcharLiteral(s); + int strLength = Utils.containChinese(s) ? 
s.length() * StringLikeLiteral.CHINESE_CHAR_BYTE_LENGTH : s.length(); + if (strLength > ScalarType.MAX_VARCHAR_LENGTH) { + return new StringLiteral(s); + } + return new VarcharLiteral(s, strLength); } /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java index ffefc3e55c..e0e28d9399 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/StringLikeLiteral.java @@ -21,8 +21,11 @@ import org.apache.doris.nereids.types.DataType; import java.util.Objects; -/** StringLikeLiteral. */ +/** + * StringLikeLiteral. + */ public abstract class StringLikeLiteral extends Literal { + public static final int CHINESE_CHAR_BYTE_LENGTH = 4; public final String value; public StringLikeLiteral(String value, DataType dataType) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java index baf37c1c64..186a3e4529 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/Utils.java @@ -277,4 +277,16 @@ public class Utils { } return CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, name); } + + /** + * Check the content if contains chinese or not, if true when contains chinese or false + */ + public static boolean containChinese(String text) { + for (char textChar : text.toCharArray()) { + if (Character.UnicodeScript.of(textChar) == Character.UnicodeScript.HAN) { + return true; + } + } + return false; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/UtilsTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/UtilsTest.java new file mode 100644 index 0000000000..0c7d903311 --- 
/dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/UtilsTest.java @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.util; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +/** + * The tests for utils + */ +public class UtilsTest { + @Test + public void containChinese() { + String chinese = "123数据库"; + Assertions.assertTrue(Utils.containChinese(chinese)); + + String en = "database123"; + Assertions.assertFalse(Utils.containChinese(en)); + } +} diff --git a/regression-test/data/mtmv_p0/test_build_mtmv.out b/regression-test/data/mtmv_p0/test_build_mtmv.out index ad0100fe27..4c2b4ea752 100644 --- a/regression-test/data/mtmv_p0/test_build_mtmv.out +++ b/regression-test/data/mtmv_p0/test_build_mtmv.out @@ -60,3 +60,6 @@ zhangsang 200 -- !select_union -- 11 111 +-- !desc_mv -- +field_1 VARCHAR(16) No false \N NONE + diff --git a/regression-test/suites/mtmv_p0/test_build_mtmv.groovy b/regression-test/suites/mtmv_p0/test_build_mtmv.groovy index 05bc5e5322..fd6bec3fcb 100644 --- a/regression-test/suites/mtmv_p0/test_build_mtmv.groovy +++ b/regression-test/suites/mtmv_p0/test_build_mtmv.groovy @@ -38,7 +38,7 @@ 
suite("test_build_mtmv") { id BIGINT, username VARCHAR(20) ) - DISTRIBUTED BY HASH(id) BUCKETS 10 + DISTRIBUTED BY HASH(id) BUCKETS 10 PROPERTIES ( "replication_num" = "1" ); @@ -52,7 +52,7 @@ suite("test_build_mtmv") { id BIGINT, pv BIGINT ) - DISTRIBUTED BY HASH(id) BUCKETS 10 + DISTRIBUTED BY HASH(id) BUCKETS 10 PROPERTIES ( "replication_num" = "1" ); @@ -580,4 +580,59 @@ suite("test_build_mtmv") { sql """ DROP MATERIALIZED VIEW ${mvName} """ + + // test build mv which containing literal varchar field + sql """ + drop table if exists lineitem + """ + sql """ + CREATE TABLE IF NOT EXISTS lineitem ( + l_orderkey INTEGER NOT NULL, + l_partkey INTEGER NOT NULL, + l_suppkey INTEGER NOT NULL, + l_linenumber INTEGER NOT NULL, + l_quantity DECIMALV3(15,2) NOT NULL, + l_extendedprice DECIMALV3(15,2) NOT NULL, + l_discount DECIMALV3(15,2) NOT NULL, + l_tax DECIMALV3(15,2) NOT NULL, + l_returnflag CHAR(1) NOT NULL, + l_linestatus CHAR(1) NOT NULL, + l_shipdate DATE NOT NULL, + l_commitdate DATE NOT NULL, + l_receiptdate DATE NOT NULL, + l_shipinstruct CHAR(25) NOT NULL, + l_shipmode CHAR(10) NOT NULL, + l_comment VARCHAR(44) NOT NULL + ) + DUPLICATE KEY(l_orderkey, l_partkey, l_suppkey, l_linenumber) + PARTITION BY RANGE(l_shipdate) ( + PARTITION `day_2` VALUES LESS THAN ('2023-12-9'), + PARTITION `day_3` VALUES LESS THAN ("2023-12-11"), + PARTITION `day_4` VALUES LESS THAN ("2023-12-30") + ) + DISTRIBUTED BY HASH(l_orderkey) BUCKETS 3 + PROPERTIES ( + "replication_num" = "1" + ) + """ + + sql """ + insert into lineitem values + (1, 2, 3, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-08', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'), + (2, 4, 3, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-09', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'), + (3, 2, 4, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-10', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'), + (4, 3, 3, 4, 5.5, 6.5, 7.5, 8.5, 'o', 'k', '2023-12-11', '2023-12-09', '2023-12-10', 'a', 'b', 'yyyyyyyyy'), + (5, 2, 
3, 6, 7.5, 8.5, 9.5, 10.5, 'k', 'o', '2023-12-12', '2023-12-12', '2023-12-13', 'c', 'd', 'xxxxxxxxx'); + """ + + sql """DROP MATERIALIZED VIEW IF EXISTS test_varchar_literal_mv;""" + sql """ + CREATE MATERIALIZED VIEW test_varchar_literal_mv + BUILD IMMEDIATE REFRESH AUTO ON MANUAL + DISTRIBUTED BY RANDOM BUCKETS 2 + PROPERTIES ('replication_num' = '1') + AS + select case when l_orderkey > 1 then "一二三四" else "五六七八" end as field_1 from lineitem; + """ + qt_desc_mv """desc test_varchar_literal_mv;""" }