Files
tidb/parser/charset/encoding.go

198 lines
5.6 KiB
Go

// Copyright 2021 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package charset
import (
"bytes"
"fmt"
"reflect"
"strings"
"unicode"
"unsafe"
"github.com/cznic/mathutil"
"github.com/pingcap/tidb/parser/mysql"
"github.com/pingcap/tidb/parser/terror"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)
// encodingLegacy is the encoding name for which no conversion is set up:
// NewEncoding and UpdateEncoding leave the converter unset when a label
// resolves to this name.
const encodingLegacy = "utf-8" // utf-8 encoding is compatible with old default behavior.
// errInvalidCharacterString is the parser-class error built from
// mysql.ErrInvalidCharacterString. generateErr fills it with the encoding name
// and the hex form of the bytes that could not be converted.
var errInvalidCharacterString = terror.ClassParser.NewStd(mysql.ErrInvalidCharacterString)
// EncodingLabel is the string type produced by Format and Formatted and
// accepted by UpdateEncoding.
type EncodingLabel string
// Format normalizes label into an EncodingLabel: leading and trailing tabs,
// newlines, carriage returns, form feeds and spaces are stripped, and the
// remainder is lowercased.
func Format(label string) EncodingLabel {
	const cutset = "\t\n\r\f "
	trimmed := strings.Trim(label, cutset)
	return EncodingLabel(strings.ToLower(trimmed))
}
// Formatted wraps label as an EncodingLabel without modifying it. It is meant
// for labels that the caller has already trimmed and lowercased; no
// normalization or validation is performed here.
func Formatted(label string) EncodingLabel {
	return EncodingLabel(label)
}
// Encoding provides an interface to encode/decode a string with a specific
// encoding. The zero value performs no conversion: its Encode/Decode methods
// return their input unchanged.
type Encoding struct {
// enc supplies the encoder and decoder used for conversion; it is nil when
// no conversion is configured.
enc encoding.Encoding
// name is the encoding name reported by Name.
name string
// charLength is called with the remaining source bytes while decoding and
// returns the byte length of the next character.
charLength func([]byte) int
// specialCase holds the value LookupSpecialCase returned for name.
specialCase unicode.SpecialCase
}
// enabled reports whether conversion is configured, i.e. both the underlying
// encoding and the character-length function are set. When it returns false
// the Encode/Decode methods pass their input through untouched.
func (e *Encoding) enabled() bool {
	if e.enc == nil || e.charLength == nil {
		return false
	}
	return true
}
// Name returns the encoding name stored on e. It is empty for a zero
// Encoding.
func (e *Encoding) Name() string {
	return e.name
}
// NewEncoding builds an Encoding for label.
//
// An empty label yields a zero Encoding. Otherwise the label is resolved with
// Lookup. If it resolves to an encoding whose name is not the legacy "utf-8",
// the result carries that encoding together with its character-length
// function and special-case table. In every other case only the resolved name
// is recorded and the result performs no conversion.
func NewEncoding(label string) *Encoding {
	if label == "" {
		return &Encoding{}
	}
	enc, name := Lookup(label)
	if enc == nil || name == encodingLegacy {
		return &Encoding{name: name}
	}
	converted := &Encoding{enc: enc, name: name}
	converted.charLength = FindNextCharacterLength(name)
	converted.specialCase = LookupSpecialCase(name)
	return converted
}
// UpdateEncoding re-targets e at the encoding identified by label.
//
// The resolved name is always stored. The converter and character-length
// function are set only when label resolves to an encoding other than the
// legacy "utf-8"; otherwise both are cleared so that e performs no
// conversion. The special-case table is refreshed from the resolved name in
// either case.
func (e *Encoding) UpdateEncoding(label EncodingLabel) {
	found, name := lookup(label)
	e.name = name
	if found == nil || name == encodingLegacy {
		// Nothing to convert with: drop any previously configured converter.
		e.enc, e.charLength = nil, nil
	} else {
		e.enc = found
		e.charLength = FindNextCharacterLength(name)
	}
	e.specialCase = LookupSpecialCase(name)
}
// Encode converts src from utf-8 into e's encoding, using dest as the output
// buffer when it is large enough. If no conversion is configured, src itself
// is returned and dest is ignored. Unconvertible characters are reported
// through the returned error; the bytes produced are returned alongside it.
func (e *Encoding) Encode(dest, src []byte) ([]byte, error) {
	if e.enabled() {
		encoder := e.enc.NewEncoder()
		return e.transform(encoder, dest, src, false)
	}
	return src, nil
}
// EncodeString converts src from utf-8 into e's encoding and returns the
// result as a string. If no conversion is configured, src is returned as is.
// On a conversion error the string produced is still returned together with
// the error.
func (e *Encoding) EncodeString(src string) (string, error) {
	if e.enabled() {
		encoded, err := e.transform(e.enc.NewEncoder(), nil, Slice(src), false)
		return string(encoded), err
	}
	return src, nil
}
// Decode converts src from e's encoding into utf-8, using dest as the output
// buffer when it is large enough. If no conversion is configured, src itself
// is returned and dest is ignored. Undecodable characters are reported
// through the returned error; the bytes produced are returned alongside it.
func (e *Encoding) Decode(dest, src []byte) ([]byte, error) {
	if e.enabled() {
		decoder := e.enc.NewDecoder()
		return e.transform(decoder, dest, src, true)
	}
	return src, nil
}
// DecodeString converts src from e's encoding into utf-8 and returns the
// result as a string. If no conversion is configured, src is returned as is.
// On a conversion error the string produced is still returned together with
// the error.
func (e *Encoding) DecodeString(src string) (string, error) {
	if e.enabled() {
		decoded, err := e.transform(e.enc.NewDecoder(), nil, Slice(src), true)
		return string(decoded), err
	}
	return src, nil
}
// transform converts src one character at a time with transformer and returns
// the converted bytes together with the first conversion error, if any.
//
// dest is used as the output buffer when it is at least as long as src;
// otherwise a new buffer of twice the source length is allocated. The buffer
// is grown whenever the transformer reports transform.ErrShortDst.
//
// A character that cannot be converted (the transformer fails, or, when
// decoding, its output begins with the Unicode replacement character) is
// replaced by a single '?' in the output and skipped in the source.
// Conversion then continues; only the first such failure is turned into the
// returned error.
func (e *Encoding) transform(transformer transform.Transformer, dest, src []byte, isDecoding bool) ([]byte, error) {
	if len(dest) < len(src) {
		dest = make([]byte, len(src)*2)
	}
	var destOffset, srcOffset int
	var encodingErr error
	for {
		// Feed the transformer exactly one source character per call so that a
		// failure can be attributed to, and skipped as, a single character.
		srcNextLen := e.nextCharLenInSrc(src[srcOffset:], isDecoding)
		srcEnd := mathutil.Min(srcOffset+srcNextLen, len(src))
		nDest, nSrc, err := transformer.Transform(dest[destOffset:], src[srcOffset:srcEnd], false)
		if err == transform.ErrShortDst {
			// Not enough room: grow the buffer and retry the same character on
			// the next iteration.
			dest = enlargeCapacity(dest)
		} else if err != nil || (isDecoding && beginWithReplacementChar(dest[destOffset:destOffset+nDest])) {
			if encodingErr == nil {
				encodingErr = e.generateErr(src[srcOffset:], srcNextLen)
			}
			// A failure other than ErrShortDst (for example ErrShortSrc on a
			// truncated trailing character) can occur while the buffer is
			// exactly full. Make room before writing the placeholder instead of
			// indexing past the end of dest.
			if destOffset >= len(dest) {
				grown := make([]byte, 2*len(dest)+1)
				copy(grown, dest)
				dest = grown
			}
			dest[destOffset] = byte('?')
			nDest, nSrc = 1, srcNextLen // skip the source bytes that cannot be decoded normally.
		}
		destOffset += nDest
		srcOffset += nSrc
		// The source bytes are exhausted.
		if srcOffset >= len(src) {
			return dest[:destOffset], encodingErr
		}
	}
}
// nextCharLenInSrc returns the byte length of the next character at the start
// of srcRest. When encoding, the source is utf-8 and the length comes from
// characterLengthUTF8. When decoding, it comes from e.charLength, or is the
// whole of srcRest if no such function is set.
func (e *Encoding) nextCharLenInSrc(srcRest []byte, isDecoding bool) int {
	if !isDecoding {
		return characterLengthUTF8(srcRest)
	}
	if e.charLength == nil {
		return len(srcRest)
	}
	return e.charLength(srcRest)
}
// enlargeCapacity returns a new slice twice as long as dest whose leading
// bytes are a copy of dest and whose remaining bytes are zero. dest itself is
// left untouched.
func enlargeCapacity(dest []byte) []byte {
	doubled := 2 * len(dest)
	grown := append(make([]byte, 0, doubled), dest...)
	return grown[:doubled]
}
// generateErr builds the invalid-character-string error for the character at
// the start of srcRest. The error arguments are the encoding name and the
// uppercase hex form of the offending bytes: the first srcNextLen bytes of
// srcRest, or all of srcRest if it is shorter.
func (e *Encoding) generateErr(srcRest []byte, srcNextLen int) error {
	cutEnd := len(srcRest)
	if srcNextLen < cutEnd {
		cutEnd = srcNextLen
	}
	offending := string(srcRest[:cutEnd])
	invalidBytes := fmt.Sprintf("%X", offending)
	return errInvalidCharacterString.GenWithStackByArgs(e.name, invalidBytes)
}
// replacementBytes is the utf-8 encoding of the Unicode replacement
// character U+FFFD.
var replacementBytes = []byte{0xEF, 0xBF, 0xBD}

// beginWithReplacementChar reports whether dst starts with the bytes
// 0xEF 0xBF 0xBD, i.e. the utf-8 form of the replacement character.
func beginWithReplacementChar(dst []byte) bool {
	n := len(replacementBytes)
	return len(dst) >= n && bytes.Equal(dst[:n], replacementBytes)
}
// Slice returns a byte slice that views the bytes of s without copying them.
// The result has length and capacity len(s) and shares its data pointer with
// s, so it must be treated as read-only.
// Use at your own risk.
func Slice(s string) (b []byte) {
	strHdr := (*reflect.StringHeader)(unsafe.Pointer(&s))
	sliceHdr := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	// Point the slice at the string's bytes first, then publish its size.
	sliceHdr.Data = strHdr.Data
	sliceHdr.Len = strHdr.Len
	sliceHdr.Cap = strHdr.Len
	return b
}