From 4e33a0f2eba4f2b6faa5d9efddb2224a96e46440 Mon Sep 17 00:00:00 2001 From: tangenta Date: Wed, 9 Feb 2022 19:25:36 +0800 Subject: [PATCH] expression: refine invalid char error msg for string conversion (#32199) close pingcap/tidb#30444 --- .../r/new_character_set_builtin.result | 20 ++++++++------ cmd/explaintest/r/select.result | 2 +- .../t/new_character_set_builtin.test | 4 +++ expression/builtin_convert_charset.go | 26 ++++++++++++++++--- 4 files changed, 40 insertions(+), 12 deletions(-) diff --git a/cmd/explaintest/r/new_character_set_builtin.result b/cmd/explaintest/r/new_character_set_builtin.result index 070e813278..96f903a13e 100644 --- a/cmd/explaintest/r/new_character_set_builtin.result +++ b/cmd/explaintest/r/new_character_set_builtin.result @@ -398,13 +398,17 @@ a like 0xe4b880 b like 0xd2bb 1 1 1 1 select a = 0xb6fe from t; -Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +Error 3854: Cannot convert string '\xB6\xFE' from binary to utf8mb4 select b = 0xe4ba8c from t; -Error 3854: Cannot convert string 'E4BA8C' from binary to gbk +Error 3854: Cannot convert string '\xE4\xBA\x8C' from binary to gbk select concat(a, 0xb6fe) from t; -Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +Error 3854: Cannot convert string '\xB6\xFE' from binary to utf8mb4 select concat(b, 0xe4ba8c) from t; -Error 3854: Cannot convert string 'E4BA8C' from binary to gbk +Error 3854: Cannot convert string '\xE4\xBA\x8C' from binary to gbk +select concat(convert('a' using gbk), 0x3fff) from t; +Error 3854: Cannot convert string '?\xFF' from binary to gbk +select concat(convert('a' using gbk), 0x3fffffffffffffff) from t; +Error 3854: Cannot convert string '?\xFF\xFF\xFF\xFF\xFF...' from binary to gbk set @@tidb_enable_vectorized_expression = false; select hex(concat(a, c)), hex(concat(b, c)) from t; hex(concat(a, c)) hex(concat(b, c)) @@ -497,13 +501,13 @@ a like 0xe4b880 b like 0xd2bb 1 1 1 1 select a = 0xb6fe from t; -Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +Error 3854: Cannot convert string '\xB6\xFE' from binary to utf8mb4 select b = 0xe4ba8c from t; -Error 3854: Cannot convert string 'E4BA8C' from binary to gbk +Error 3854: Cannot convert string '\xE4\xBA\x8C' from binary to gbk select concat(a, 0xb6fe) from t; -Error 3854: Cannot convert string 'B6FE' from binary to utf8mb4 +Error 3854: Cannot convert string '\xB6\xFE' from binary to utf8mb4 select concat(b, 0xe4ba8c) from t; -Error 3854: Cannot convert string 'E4BA8C' from binary to gbk +Error 3854: Cannot convert string '\xE4\xBA\x8C' from binary to gbk drop table if exists t; create table t (a char(20) charset utf8mb4, b char(20) charset gbk, c binary(20)); insert into t values ('一二三', '一二三', '一二三'); diff --git a/cmd/explaintest/r/select.result b/cmd/explaintest/r/select.result index f364a577e1..63bdb29e7d 100644 --- a/cmd/explaintest/r/select.result +++ b/cmd/explaintest/r/select.result @@ -498,4 +498,4 @@ a b c d create table t3(a char(10), primary key (a)); insert into t3 values ('a'); select * from t3 where a > 0x80; -Error 1105: Cannot convert string '80' from binary to utf8mb4 +Error 1105: Cannot convert string '\x80' from binary to utf8mb4 diff --git a/cmd/explaintest/t/new_character_set_builtin.test b/cmd/explaintest/t/new_character_set_builtin.test index 5f7d404e3f..93f160832c 100644 --- a/cmd/explaintest/t/new_character_set_builtin.test +++ b/cmd/explaintest/t/new_character_set_builtin.test @@ -200,6 +200,10 @@ select b = 0xe4ba8c from t; select concat(a, 0xb6fe) from t; --error 3854 select concat(b, 0xe4ba8c) from t; +--error 3854 +select concat(convert('a' using gbk), 0x3fff) from t; +--error 3854 +select concat(convert('a' using gbk), 0x3fffffffffffffff) from t; set @@tidb_enable_vectorized_expression = false; select hex(concat(a, c)), hex(concat(b, c)) from t; diff --git a/expression/builtin_convert_charset.go b/expression/builtin_convert_charset.go index 02ba0bfd2a..c21cd32c92 100644 --- a/expression/builtin_convert_charset.go +++ b/expression/builtin_convert_charset.go @@ -17,6 +17,8 @@ package expression import ( "bytes" "fmt" + "strings" + "unicode" "github.com/pingcap/tidb/errno" "github.com/pingcap/tidb/parser/ast" @@ -172,9 +174,10 @@ func (b *builtinInternalFromBinarySig) evalString(row chunk.Row) (res string, is return val, isNull, err } enc := charset.FindEncoding(b.tp.Charset) - ret, err := enc.Transform(nil, hack.Slice(val), charset.OpDecode) + valBytes := hack.Slice(val) + ret, err := enc.Transform(nil, valBytes, charset.OpDecode) if err != nil { - strHex := fmt.Sprintf("%X", val) + strHex := formatInvalidChars(valBytes) err = errCannotConvertString.GenWithStackByArgs(strHex, charset.CharsetBin, b.tp.Charset) } return string(ret), false, err @@ -205,7 +208,7 @@ func (b *builtinInternalFromBinarySig) vecEvalString(input *chunk.Chunk, result str := buf.GetBytes(i) val, err := enc.Transform(encodedBuf, str, charset.OpDecode) if err != nil { - strHex := fmt.Sprintf("%X", str) + strHex := formatInvalidChars(str) return errCannotConvertString.GenWithStackByArgs(strHex, charset.CharsetBin, b.tp.Charset) } result.AppendBytes(val) @@ -334,3 +337,20 @@ func isLegacyCharset(chs string) bool { } return false } + +func formatInvalidChars(src []byte) string { + var sb strings.Builder + const maxBytesToShow = 5 + for i := 0; i < len(src); i++ { + if i > maxBytesToShow { + sb.WriteString("...") + break + } + if src[i] > unicode.MaxASCII { + sb.WriteString(fmt.Sprintf("\\x%X", src[i])) + } else { + sb.Write([]byte{src[i]}) + } + } + return sb.String() +}