From cd7a61762fecd62866bb779d36dd52f852131e87 Mon Sep 17 00:00:00 2001 From: Hangjie Mo Date: Thu, 15 May 2025 14:34:52 +0800 Subject: [PATCH] types: fix `decodeEscapedUnicode` func work with `surrogate pair` (#61109) close pingcap/tidb#61091 --- pkg/types/json_binary_functions.go | 48 ++++++++++--- pkg/types/json_binary_functions_test.go | 68 +++++++++++++++++-- .../r/types/json_binary_functions.result | 16 +++++ .../t/types/json_binary_functions.test | 14 ++++ 4 files changed, 129 insertions(+), 17 deletions(-) create mode 100644 tests/integrationtest/r/types/json_binary_functions.result create mode 100644 tests/integrationtest/t/types/json_binary_functions.test diff --git a/pkg/types/json_binary_functions.go b/pkg/types/json_binary_functions.go index 2159b4419a..0d89f6604a 100644 --- a/pkg/types/json_binary_functions.go +++ b/pkg/types/json_binary_functions.go @@ -22,6 +22,7 @@ import ( "math" "slices" "sort" + "unicode/utf16" "unicode/utf8" "github.com/pingcap/errors" @@ -132,8 +133,18 @@ func unquoteJSONString(s string) (string, error) { - if i+4 > len(s) { + if i+5 > len(s) { return "", errors.Errorf("Invalid unicode: %s", s[i+1:]) } - char, size, err := decodeEscapedUnicode(hack.Slice(s[i+1 : i+5])) + char, size, inSurrogateRange, err := decodeOneEscapedUnicode(hack.Slice(s[i+1 : i+5])) if err != nil { + // A `surrogate pair` uses two `\\uxxxx` escapes to encode one character; the second escape ends at s[i+10], so len(s) must be at least i+11. + if inSurrogateRange && len(s) >= i+11 && s[i+5] == '\\' && s[i+6] == 'u' { + char, size, _, err = decodeOneEscapedUnicode(append([]byte(s[i+1:i+5]), []byte(s[i+7:i+11])...)) + if err != nil { + return "", errors.Trace(err) + } + ret.Write(char[0:size]) + i += 10 + continue + } return "", errors.Trace(err) } ret.Write(char[0:size]) @@ -149,19 +160,34 @@ func unquoteJSONString(s string) (string, error) { return ret.String(), nil } -// decodeEscapedUnicode decodes unicode into utf8 bytes specified in RFC 3629. +// decodeOneEscapedUnicode decodes one unicode into utf8 bytes specified in RFC 3629. 
// According RFC 3629, the max length of utf8 characters is 4 bytes. // And MySQL use 4 bytes to represent the unicode which must be in [0, 65536). -func decodeEscapedUnicode(s []byte) (char [4]byte, size int, err error) { - size, err = hex.Decode(char[0:2], s) - if err != nil || size != 2 { - // The unicode must can be represented in 2 bytes. - return char, 0, errors.Trace(err) +func decodeOneEscapedUnicode(s []byte) (char [4]byte, size int, inSurrogateRange bool, err error) { + if len(s) > 8 { + return char, 0, false, errors.Errorf("Invalid `s` for decodeOneEscapedUnicode: %s", s) + } + size, err = hex.Decode(char[0:4], s) + if err != nil { + return char, 0, false, errors.Trace(err) + } + if size != 2 && size != 4 { + // The input must decode to exactly 2 bytes (one UTF-16 code unit) or 4 bytes (a surrogate pair). + return char, size, false, errors.Errorf("Invalid unicode length: %d", size) } - unicode := binary.BigEndian.Uint16(char[0:2]) - size = utf8.RuneLen(rune(unicode)) - utf8.EncodeRune(char[0:size], rune(unicode)) + r1 := rune(binary.BigEndian.Uint16(char[0:2])) + if size == 4 { + r1 = utf16.DecodeRune(r1, rune(binary.BigEndian.Uint16(char[2:4]))) + } + size = utf8.RuneLen(r1) + if size < 0 { + if r1 >= 0xD800 && r1 <= 0xDFFF { + inSurrogateRange = true + } + return char, size, inSurrogateRange, errors.Errorf("Invalid unicode: %s", s) + } + utf8.EncodeRune(char[0:size], r1) return } diff --git a/pkg/types/json_binary_functions_test.go b/pkg/types/json_binary_functions_test.go index fa60471412..7e94bee209 100644 --- a/pkg/types/json_binary_functions_test.go +++ b/pkg/types/json_binary_functions_test.go @@ -20,17 +20,73 @@ import ( ) func TestDecodeEscapedUnicode(t *testing.T) { - in := "597d" - r, size, err := decodeEscapedUnicode([]byte(in)) - require.NoError(t, err) - require.Equal(t, "好\x00", string(r[:])) - require.Equal(t, 3, size) + testCases := []struct { + input string + expectedResult string + size int + inSurrogateRange bool + expectedValid bool + }{ + {"597d", "好\x00", 3, false, 
true}, + {"fffd", "�\x00", 3, false, true}, + {"D83DDE0A", "😊", 4, false, true}, + {"D83D", "", 0, true, false}, + {"D83D11", "", 0, false, false}, + {"ZZZZ", "", 0, false, false}, + {"D83DDE0A597d", "", 0, false, false}, + } + + for _, tc := range testCases { + result, size, inSurrogateRange, err := decodeOneEscapedUnicode([]byte(tc.input)) + require.Equal(t, tc.inSurrogateRange, inSurrogateRange) + if tc.expectedValid { + require.NoError(t, err) + require.Equal(t, tc.expectedResult, string(result[:])) + require.Equal(t, tc.size, size) + } else { + require.Error(t, err) + } + } +} + +func TestUnquoteJSONString(t *testing.T) { + var testCases = []struct { + input string + expectedResult string + expectedValid bool + }{ + {"\\b", "\b", true}, + {"\\f", "\f", true}, + {"\\n", "\n", true}, + {"\\r", "\r", true}, + {"\\t", "\t", true}, + {"\\\\", "\\", true}, + {"\\u597d", "好", true}, + {"0\\u597d0", "0好0", true}, + {"\\a", "a", true}, + {"[", "[", true}, + {"\\ud83e\\udd21", "🤡", true}, + {"\\ufffd", "�", true}, + // invalid input + {"\\", "", false}, + {"\\u59", "", false}, + } + + for _, tc := range testCases { + result, err := unquoteJSONString(tc.input) + if tc.expectedValid { + require.NoError(t, err) + require.Equal(t, tc.expectedResult, result) + } else { + require.Error(t, err) + } + } } func BenchmarkDecodeEscapedUnicode(b *testing.B) { for i := 0; i < b.N; i++ { in := "597d" - _, _, _ = decodeEscapedUnicode([]byte(in)) + _, _, _, _ = decodeOneEscapedUnicode([]byte(in)) } } diff --git a/tests/integrationtest/r/types/json_binary_functions.result b/tests/integrationtest/r/types/json_binary_functions.result new file mode 100644 index 0000000000..266eb3f316 --- /dev/null +++ b/tests/integrationtest/r/types/json_binary_functions.result @@ -0,0 +1,16 @@ +drop table if exists t; +CREATE TABLE `t` ( +`id` int NOT NULL AUTO_INCREMENT, +`value` json DEFAULT NULL, +`value_custom` json DEFAULT NULL, +PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 
COLLATE=utf8mb4_general_ci; +INSERT INTO `t` (`value`, `value_custom`) VALUES ('{\"emo\\ud83e\\udd21\'ji\": \"some value\", \"escape\\uffff\'seq\'\\uffffue\\uffff\'nce\": \"some value\"}', NULL); +SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."emo\\ud83e\\udd21\'ji"'); +1 +1 +SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\uffffue\\uffff\'nce"'); +1 +1 +SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\ufffdue\\uffff\'nce"'); +1 diff --git a/tests/integrationtest/t/types/json_binary_functions.test b/tests/integrationtest/t/types/json_binary_functions.test new file mode 100644 index 0000000000..cf35bd5c84 --- /dev/null +++ b/tests/integrationtest/t/types/json_binary_functions.test @@ -0,0 +1,14 @@ +drop table if exists t; +CREATE TABLE `t` ( + `id` int NOT NULL AUTO_INCREMENT, + `value` json DEFAULT NULL, + `value_custom` json DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci; +INSERT INTO `t` (`value`, `value_custom`) VALUES ('{\"emo\\ud83e\\udd21\'ji\": \"some value\", \"escape\\uffff\'seq\'\\uffffue\\uffff\'nce\": \"some value\"}', NULL); + +SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."emo\\ud83e\\udd21\'ji"'); + +SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\uffffue\\uffff\'nce"'); + +SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\ufffdue\\uffff\'nce"');