types: fix decodeEscapedUnicode func to work with surrogate pairs (#61109)

close pingcap/tidb#61091
This commit is contained in:
Hangjie Mo
2025-05-15 14:34:52 +08:00
committed by GitHub
parent 617661007e
commit cd7a61762f
4 changed files with 128 additions and 16 deletions

View File

@ -22,6 +22,7 @@ import (
"math"
"slices"
"sort"
"unicode/utf16"
"unicode/utf8"
"github.com/pingcap/errors"
@ -132,8 +133,18 @@ func unquoteJSONString(s string) (string, error) {
if i+4 > len(s) {
return "", errors.Errorf("Invalid unicode: %s", s[i+1:])
}
char, size, err := decodeEscapedUnicode(hack.Slice(s[i+1 : i+5]))
char, size, inSurrogateRange, err := decodeOneEscapedUnicode(hack.Slice(s[i+1 : i+5]))
if err != nil {
// For `surrogate pair`, it uses two `\\uxxx` to encode a character.
if inSurrogateRange && len(s) >= i+10 && s[i+5] == '\\' && s[i+6] == 'u' {
char, size, _, err = decodeOneEscapedUnicode(append([]byte(s[i+1:i+5]), []byte(s[i+7:i+11])...))
if err != nil {
return "", errors.Trace(err)
}
ret.Write(char[0:size])
i += 10
continue
}
return "", errors.Trace(err)
}
ret.Write(char[0:size])
@ -149,19 +160,34 @@ func unquoteJSONString(s string) (string, error) {
return ret.String(), nil
}
// decodeEscapedUnicode decodes unicode into utf8 bytes specified in RFC 3629.
// decodeOneEscapedUnicode decodes one escaped unicode character, given as hex digits,
// into the utf8 bytes specified in RFC 3629.
// According to RFC 3629, the max length of a utf8 character is 4 bytes.
// MySQL uses 4 bytes to represent the unicode, which must be in [0, 65536).
//
// The input is hex-decoded into at most 4 bytes: 2 decoded bytes are treated as a single
// 16-bit value, 4 decoded bytes as two 16-bit values combined with utf16.DecodeRune.
// On success the utf8 encoding is written into the first `size` bytes of `char`.
// inSurrogateRange is set only on the error path, when the decoded value lies in
// [0xD800, 0xDFFF] and therefore cannot be encoded as utf8 on its own.
func decodeEscapedUnicode(s []byte) (char [4]byte, size int, err error) {
size, err = hex.Decode(char[0:2], s)
if err != nil || size != 2 {
// The unicode must be representable in 2 bytes.
return char, 0, errors.Trace(err)
func decodeOneEscapedUnicode(s []byte) (char [4]byte, size int, inSurrogateRange bool, err error) {
// More than 8 hex digits would decode to more than the 4 bytes `char` can hold.
if len(s) > 8 {
return char, 0, false, errors.Errorf("Invalid `s` for decodeEscapedUnicode: %s", s)
}
size, err = hex.Decode(char[0:4], s)
if err != nil {
return char, 0, false, errors.Trace(err)
}
if size != 2 && size != 4 {
// The unicode must be representable in 2 bytes or 4 bytes.
return char, size, false, errors.Errorf("Invalid unicode length: %d", size)
}
unicode := binary.BigEndian.Uint16(char[0:2])
size = utf8.RuneLen(rune(unicode))
utf8.EncodeRune(char[0:size], rune(unicode))
r1 := rune(binary.BigEndian.Uint16(char[0:2]))
if size == 4 {
// Two 16-bit values were decoded: combine them as a UTF-16 surrogate pair.
r1 = utf16.DecodeRune(r1, rune(binary.BigEndian.Uint16(char[2:4])))
}
size = utf8.RuneLen(r1)
if size < 0 {
// utf8.RuneLen reports a negative length when the rune is not a valid value to encode,
// which includes a surrogate half on its own.
if r1 >= 0xD800 && r1 <= 0xDFFF {
inSurrogateRange = true
}
// NOTE(review): size is still negative when returned on this error path.
return char, size, inSurrogateRange, errors.Errorf("Invalid unicode: %s", s)
}
utf8.EncodeRune(char[0:size], r1)
return
}

View File

@ -20,17 +20,73 @@ import (
)
// TestDecodeEscapedUnicode runs a table of hex inputs through the decoder and checks the
// decoded bytes, the reported size, the inSurrogateRange flag and whether an error is returned.
func TestDecodeEscapedUnicode(t *testing.T) {
in := "597d"
r, size, err := decodeEscapedUnicode([]byte(in))
require.NoError(t, err)
require.Equal(t, "好\x00", string(r[:]))
require.Equal(t, 3, size)
testCases := []struct {
input string
expectedResult string
size int
inSurrogateRange bool
expectedValid bool
}{
// 4 hex digits; the result is compared against the whole 4-byte array, hence the trailing \x00.
{"597d", "好\x00", 3, false, true},
{"fffd", "�\x00", 3, false, true},
// 8 hex digits forming a surrogate pair, expected to decode to a 4-byte character.
{"D83DDE0A", "😊", 4, false, true},
// A single value in the surrogate range: expected to fail with inSurrogateRange set.
{"D83D", "", 0, true, false},
// 6 hex digits: expected to be rejected.
{"D83D11", "", 0, false, false},
// Not hexadecimal: expected to be rejected.
{"ZZZZ", "", 0, false, false},
// 12 hex digits: expected to be rejected.
{"D83DDE0A597d", "", 0, false, false},
}
for _, tc := range testCases {
result, size, inSurrogateRange, err := decodeOneEscapedUnicode([]byte(tc.input))
// The surrogate flag is checked for every case, valid or not.
require.Equal(t, tc.inSurrogateRange, inSurrogateRange)
if tc.expectedValid {
require.NoError(t, err)
require.Equal(t, tc.expectedResult, string(result[:]))
require.Equal(t, tc.size, size)
} else {
// For invalid input only the presence of an error is asserted.
require.Error(t, err)
}
}
}
// TestUnquoteJSONString feeds escaped inputs to unquoteJSONString and verifies either the
// unescaped output or, for malformed input, that an error is reported.
func TestUnquoteJSONString(t *testing.T) {
	type unquoteCase struct {
		raw   string
		want  string
		valid bool
	}
	cases := []unquoteCase{
		// Single-character escapes.
		{"\\b", "\b", true},
		{"\\f", "\f", true},
		{"\\n", "\n", true},
		{"\\r", "\r", true},
		{"\\t", "\t", true},
		{"\\\\", "\\", true},
		// \u escapes, alone and surrounded by plain characters.
		{"\\u597d", "好", true},
		{"0\\u597d0", "0好0", true},
		// Other inputs the table expects to succeed.
		{"\\a", "a", true},
		{"[", "[", true},
		// Two consecutive \u escapes forming a surrogate pair.
		{"\\ud83e\\udd21", "🤡", true},
		{"\\ufffd", "�", true},
		// invalid input
		{"\\", "", false},
		{"\\u59", "", false},
	}
	for _, c := range cases {
		got, err := unquoteJSONString(c.raw)
		if !c.valid {
			require.Error(t, err)
			continue
		}
		require.NoError(t, err)
		require.Equal(t, c.want, got)
	}
}
// BenchmarkDecodeEscapedUnicode repeatedly decodes the 4-hex-digit input "597d".
func BenchmarkDecodeEscapedUnicode(b *testing.B) {
for i := 0; i < b.N; i++ {
in := "597d"
// All return values are discarded; only the call itself is exercised.
_, _, _ = decodeEscapedUnicode([]byte(in))
_, _, _, _ = decodeOneEscapedUnicode([]byte(in))
}
}

View File

@ -0,0 +1,16 @@
drop table if exists t;
CREATE TABLE `t` (
`id` int NOT NULL AUTO_INCREMENT,
`value` json DEFAULT NULL,
`value_custom` json DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
INSERT INTO `t` (`value`, `value_custom`) VALUES ('{\"emo\\ud83e\\udd21\'ji\": \"some value\", \"escape\\uffff\'seq\'\\uffffue\\uffff\'nce\": \"some value\"}', NULL);
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."emo\\ud83e\\udd21\'ji"');
1
1
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\uffffue\\uffff\'nce"');
1
1
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\ufffdue\\uffff\'nce"');
1

View File

@ -0,0 +1,14 @@
# Exercises JSON_CONTAINS_PATH with object keys and path expressions that contain \u escapes,
# including a surrogate pair (presumably the regression case for pingcap/tidb#61091 — confirm).
drop table if exists t;
CREATE TABLE `t` (
`id` int NOT NULL AUTO_INCREMENT,
`value` json DEFAULT NULL,
`value_custom` json DEFAULT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
# The stored object has one key containing a surrogate-pair escape (\ud83e\udd21) and one key
# containing three \uffff escapes.
INSERT INTO `t` (`value`, `value_custom`) VALUES ('{\"emo\\ud83e\\udd21\'ji\": \"some value\", \"escape\\uffff\'seq\'\\uffffue\\uffff\'nce\": \"some value\"}', NULL);
# Path written with the same surrogate-pair escape as the stored key.
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."emo\\ud83e\\udd21\'ji"');
# Path written with the same \uffff escapes as the stored key.
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\uffffue\\uffff\'nce"');
# Path with \ufffd in place of the second \uffff, so it differs from the stored key.
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\ufffdue\\uffff\'nce"');