types: fix decodeEscapedUnicode func work with surrogate pair (#61109)
close pingcap/tidb#61091
This commit is contained in:
@ -22,6 +22,7 @@ import (
|
||||
"math"
|
||||
"slices"
|
||||
"sort"
|
||||
"unicode/utf16"
|
||||
"unicode/utf8"
|
||||
|
||||
"github.com/pingcap/errors"
|
||||
@ -132,8 +133,18 @@ func unquoteJSONString(s string) (string, error) {
|
||||
if i+4 > len(s) {
|
||||
return "", errors.Errorf("Invalid unicode: %s", s[i+1:])
|
||||
}
|
||||
char, size, err := decodeEscapedUnicode(hack.Slice(s[i+1 : i+5]))
|
||||
char, size, inSurrogateRange, err := decodeOneEscapedUnicode(hack.Slice(s[i+1 : i+5]))
|
||||
if err != nil {
|
||||
// A surrogate pair is written as two consecutive `\uXXXX` escapes that together encode one character.
|
||||
if inSurrogateRange && len(s) >= i+10 && s[i+5] == '\\' && s[i+6] == 'u' {
|
||||
char, size, _, err = decodeOneEscapedUnicode(append([]byte(s[i+1:i+5]), []byte(s[i+7:i+11])...))
|
||||
if err != nil {
|
||||
return "", errors.Trace(err)
|
||||
}
|
||||
ret.Write(char[0:size])
|
||||
i += 10
|
||||
continue
|
||||
}
|
||||
return "", errors.Trace(err)
|
||||
}
|
||||
ret.Write(char[0:size])
|
||||
@ -149,19 +160,34 @@ func unquoteJSONString(s string) (string, error) {
|
||||
return ret.String(), nil
|
||||
}
|
||||
|
||||
// decodeEscapedUnicode decodes unicode into utf8 bytes specified in RFC 3629.
|
||||
// decodeOneEscapedUnicode decodes one escaped Unicode character (4 hex digits, or 8 hex digits for a surrogate pair) into UTF-8 bytes as specified in RFC 3629.
|
||||
// According to RFC 3629, the maximum length of a UTF-8 character is 4 bytes.
|
||||
// MySQL uses 4 bytes (four hex digits) to represent each escaped code unit, whose value must be in [0, 65536).
|
||||
func decodeEscapedUnicode(s []byte) (char [4]byte, size int, err error) {
|
||||
size, err = hex.Decode(char[0:2], s)
|
||||
if err != nil || size != 2 {
|
||||
// The unicode must be representable in 2 bytes.
|
||||
return char, 0, errors.Trace(err)
|
||||
func decodeOneEscapedUnicode(s []byte) (char [4]byte, size int, inSurrogateRange bool, err error) {
|
||||
if len(s) > 8 {
|
||||
return char, 0, false, errors.Errorf("Invalid `s` for decodeEscapedUnicode: %s", s)
|
||||
}
|
||||
size, err = hex.Decode(char[0:4], s)
|
||||
if err != nil {
|
||||
return char, 0, false, errors.Trace(err)
|
||||
}
|
||||
if size != 2 && size != 4 {
|
||||
// The decoded value must occupy exactly 2 bytes, or 4 bytes for a surrogate pair.
|
||||
return char, size, false, errors.Errorf("Invalid unicode length: %d", size)
|
||||
}
|
||||
|
||||
unicode := binary.BigEndian.Uint16(char[0:2])
|
||||
size = utf8.RuneLen(rune(unicode))
|
||||
utf8.EncodeRune(char[0:size], rune(unicode))
|
||||
r1 := rune(binary.BigEndian.Uint16(char[0:2]))
|
||||
if size == 4 {
|
||||
r1 = utf16.DecodeRune(r1, rune(binary.BigEndian.Uint16(char[2:4])))
|
||||
}
|
||||
size = utf8.RuneLen(r1)
|
||||
if size < 0 {
|
||||
if r1 >= 0xD800 && r1 <= 0xDFFF {
|
||||
inSurrogateRange = true
|
||||
}
|
||||
return char, size, inSurrogateRange, errors.Errorf("Invalid unicode: %s", s)
|
||||
}
|
||||
utf8.EncodeRune(char[0:size], r1)
|
||||
return
|
||||
}
|
||||
|
||||
|
||||
@ -20,17 +20,73 @@ import (
|
||||
)
|
||||
|
||||
func TestDecodeEscapedUnicode(t *testing.T) {
|
||||
in := "597d"
|
||||
r, size, err := decodeEscapedUnicode([]byte(in))
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "好\x00", string(r[:]))
|
||||
require.Equal(t, 3, size)
|
||||
testCases := []struct {
|
||||
input string
|
||||
expectedResult string
|
||||
size int
|
||||
inSurrogateRange bool
|
||||
expectedValid bool
|
||||
}{
|
||||
{"597d", "好\x00", 3, false, true},
|
||||
{"fffd", "�\x00", 3, false, true},
|
||||
{"D83DDE0A", "😊", 4, false, true},
|
||||
{"D83D", "", 0, true, false},
|
||||
{"D83D11", "", 0, false, false},
|
||||
{"ZZZZ", "", 0, false, false},
|
||||
{"D83DDE0A597d", "", 0, false, false},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
result, size, inSurrogateRange, err := decodeOneEscapedUnicode([]byte(tc.input))
|
||||
require.Equal(t, tc.inSurrogateRange, inSurrogateRange)
|
||||
if tc.expectedValid {
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, tc.expectedResult, string(result[:]))
|
||||
require.Equal(t, tc.size, size)
|
||||
} else {
|
||||
require.Error(t, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestUnquoteJSONString(t *testing.T) {
|
||||
var testCases = []struct {
|
||||
input string
|
||||
expectedResult string
|
||||
expectedValid bool
|
||||
}{
|
||||
{"\\b", "\b", true},
|
||||
{"\\f", "\f", true},
|
||||
{"\\n", "\n", true},
|
||||
{"\\r", "\r", true},
|
||||
{"\\t", "\t", true},
|
||||
{"\\\\", "\\", true},
|
||||
{"\\u597d", "好", true},
|
||||
{"0\\u597d0", "0好0", true},
|
||||
{"\\a", "a", true},
|
||||
{"[", "[", true},
|
||||
{"\\ud83e\\udd21", "🤡", true},
|
||||
{"\\ufffd", "�", true},
|
||||
// invalid input
|
||||
{"\\", "", false},
|
||||
{"\\u59", "", false},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
result, err := unquoteJSONString(tc.input)
|
||||
if tc.expectedValid {
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, tc.expectedResult, result)
|
||||
} else {
|
||||
require.Error(t, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkDecodeEscapedUnicode(b *testing.B) {
|
||||
for i := 0; i < b.N; i++ {
|
||||
in := "597d"
|
||||
_, _, _ = decodeEscapedUnicode([]byte(in))
|
||||
_, _, _, _ = decodeOneEscapedUnicode([]byte(in))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
16
tests/integrationtest/r/types/json_binary_functions.result
Normal file
16
tests/integrationtest/r/types/json_binary_functions.result
Normal file
@ -0,0 +1,16 @@
|
||||
drop table if exists t;
|
||||
CREATE TABLE `t` (
|
||||
`id` int NOT NULL AUTO_INCREMENT,
|
||||
`value` json DEFAULT NULL,
|
||||
`value_custom` json DEFAULT NULL,
|
||||
PRIMARY KEY (`id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
|
||||
INSERT INTO `t` (`value`, `value_custom`) VALUES ('{\"emo\\ud83e\\udd21\'ji\": \"some value\", \"escape\\uffff\'seq\'\\uffffue\\uffff\'nce\": \"some value\"}', NULL);
|
||||
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."emo\\ud83e\\udd21\'ji"');
|
||||
1
|
||||
1
|
||||
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\uffffue\\uffff\'nce"');
|
||||
1
|
||||
1
|
||||
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\ufffdue\\uffff\'nce"');
|
||||
1
|
||||
14
tests/integrationtest/t/types/json_binary_functions.test
Normal file
14
tests/integrationtest/t/types/json_binary_functions.test
Normal file
@ -0,0 +1,14 @@
|
||||
drop table if exists t;
|
||||
CREATE TABLE `t` (
|
||||
`id` int NOT NULL AUTO_INCREMENT,
|
||||
`value` json DEFAULT NULL,
|
||||
`value_custom` json DEFAULT NULL,
|
||||
PRIMARY KEY (`id`)
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;
|
||||
INSERT INTO `t` (`value`, `value_custom`) VALUES ('{\"emo\\ud83e\\udd21\'ji\": \"some value\", \"escape\\uffff\'seq\'\\uffffue\\uffff\'nce\": \"some value\"}', NULL);
|
||||
|
||||
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."emo\\ud83e\\udd21\'ji"');
|
||||
|
||||
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\uffffue\\uffff\'nce"');
|
||||
|
||||
SELECT 1 FROM `t` WHERE JSON_CONTAINS_PATH(`t`.`value`, 'one', '$."escape\\uffff\'seq\'\\ufffdue\\uffff\'nce"');
|
||||
Reference in New Issue
Block a user