[Fix](Nereids) Fix datatype length wrong when string contains chinese (#29885)

When a varchar literal contains Chinese characters, the length of the varchar type should not be the number of 
characters in the literal; it should be the number of bytes the literal actually occupies.
Chinese is represented in Unicode, and a Chinese character occupies at most 4 bytes. So if a varchar literal contains 
Chinese, we set the length to 4 * length.

For example:
>        CREATE MATERIALIZED VIEW test_varchar_literal_mv
>             BUILD IMMEDIATE REFRESH AUTO ON MANUAL
>             DISTRIBUTED BY RANDOM BUCKETS 2
>             PROPERTIES ('replication_num' = '1')
>             AS
>             select case when l_orderkey > 1 then "一二三四" else "五六七八" end as field_1 from lineitem;

mysql> desc test_varchar_literal_mv;
The definition of the materialized view is as follows:
+---------+-------------+------+-------+---------+-------+
| Field   | Type        | Null | Key   | Default | Extra |
+---------+-------------+------+-------+---------+-------+
| field_1 | VARCHAR(16) | No   | false | NULL    | NONE  |
+---------+-------------+------+-------+---------+-------+
This commit is contained in:
seawinde
2024-01-12 18:53:29 +08:00
committed by yiguolei
parent 115815739c
commit 0b16938b7f
6 changed files with 119 additions and 4 deletions

View File

@ -25,6 +25,7 @@ import org.apache.doris.analysis.UserIdentity;
import org.apache.doris.catalog.AggregateType;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.KeysType;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.common.Config;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
@ -316,6 +317,7 @@ import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.literal.MapLiteral;
import org.apache.doris.nereids.trees.expressions.literal.NullLiteral;
import org.apache.doris.nereids.trees.expressions.literal.SmallIntLiteral;
import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral;
import org.apache.doris.nereids.trees.expressions.literal.StringLiteral;
import org.apache.doris.nereids.trees.expressions.literal.StructLiteral;
import org.apache.doris.nereids.trees.expressions.literal.TinyIntLiteral;
@ -406,6 +408,7 @@ import org.apache.doris.nereids.types.VarcharType;
import org.apache.doris.nereids.types.coercion.CharacterType;
import org.apache.doris.nereids.util.ExpressionUtils;
import org.apache.doris.nereids.util.RelationUtil;
import org.apache.doris.nereids.util.Utils;
import org.apache.doris.policy.FilterType;
import org.apache.doris.policy.PolicyTypeEnum;
import org.apache.doris.qe.ConnectContext;
@ -2104,7 +2107,11 @@ public class LogicalPlanBuilder extends DorisParserBaseVisitor<Object> {
if (!SqlModeHelper.hasNoBackSlashEscapes()) {
s = LogicalPlanBuilderAssistant.escapeBackSlash(s);
}
return new VarcharLiteral(s);
int strLength = Utils.containChinese(s) ? s.length() * StringLikeLiteral.CHINESE_CHAR_BYTE_LENGTH : s.length();
if (strLength > ScalarType.MAX_VARCHAR_LENGTH) {
return new StringLiteral(s);
}
return new VarcharLiteral(s, strLength);
}
/**

View File

@ -21,8 +21,11 @@ import org.apache.doris.nereids.types.DataType;
import java.util.Objects;
/** StringLikeLiteral. */
/**
* StringLikeLiteral.
*/
public abstract class StringLikeLiteral extends Literal {
public static final int CHINESE_CHAR_BYTE_LENGTH = 4;
public final String value;
public StringLikeLiteral(String value, DataType dataType) {

View File

@ -277,4 +277,16 @@ public class Utils {
}
return CaseFormat.UPPER_CAMEL.to(CaseFormat.LOWER_UNDERSCORE, name);
}
/**
 * Checks whether the given text contains at least one Chinese (Han script) character.
 *
 * <p>The text is scanned by Unicode code point rather than by UTF-16 {@code char}, so Han
 * characters outside the Basic Multilingual Plane (for example U+20000, which is stored as a
 * surrogate pair) are detected as well.
 *
 * @param text the text to inspect; must not be null
 * @return true if any code point of {@code text} belongs to the Han script, false otherwise
 */
public static boolean containChinese(String text) {
    int index = 0;
    int length = text.length();
    while (index < length) {
        // Read a full code point so that a surrogate pair is classified as one character.
        int codePoint = text.codePointAt(index);
        if (Character.UnicodeScript.of(codePoint) == Character.UnicodeScript.HAN) {
            return true;
        }
        // Advance by 1 for BMP characters, by 2 for supplementary characters.
        index += Character.charCount(codePoint);
    }
    return false;
}
}