500 lines
19 KiB
Go
500 lines
19 KiB
Go
// Copyright 2020 PingCAP, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package expression
|
|
|
|
import (
|
|
"github.com/pingcap/tidb/parser/ast"
|
|
"github.com/pingcap/tidb/parser/charset"
|
|
"github.com/pingcap/tidb/parser/mysql"
|
|
"github.com/pingcap/tidb/sessionctx"
|
|
"github.com/pingcap/tidb/types"
|
|
"github.com/pingcap/tidb/util/chunk"
|
|
"github.com/pingcap/tidb/util/collate"
|
|
"github.com/pingcap/tidb/util/hack"
|
|
"github.com/pingcap/tidb/util/logutil"
|
|
)
|
|
|
|
// ExprCollation is a struct that store the collation related information.
|
|
type ExprCollation struct {
|
|
Coer Coercibility
|
|
Repe Repertoire
|
|
Charset string
|
|
Collation string
|
|
}
|
|
|
|
type collationInfo struct {
|
|
coer Coercibility
|
|
coerInit bool
|
|
repertoire Repertoire
|
|
|
|
charset string
|
|
collation string
|
|
}
|
|
|
|
func (c *collationInfo) HasCoercibility() bool {
|
|
return c.coerInit
|
|
}
|
|
|
|
func (c *collationInfo) Coercibility() Coercibility {
|
|
return c.coer
|
|
}
|
|
|
|
// SetCoercibility implements CollationInfo SetCoercibility interface.
|
|
func (c *collationInfo) SetCoercibility(val Coercibility) {
|
|
c.coer = val
|
|
c.coerInit = true
|
|
}
|
|
|
|
func (c *collationInfo) Repertoire() Repertoire {
|
|
return c.repertoire
|
|
}
|
|
|
|
func (c *collationInfo) SetRepertoire(r Repertoire) {
|
|
c.repertoire = r
|
|
}
|
|
|
|
func (c *collationInfo) SetCharsetAndCollation(chs, coll string) {
|
|
c.charset, c.collation = chs, coll
|
|
}
|
|
|
|
func (c *collationInfo) CharsetAndCollation() (string, string) {
|
|
return c.charset, c.collation
|
|
}
|
|
|
|
// CollationInfo contains all interfaces about dealing with collation.
|
|
type CollationInfo interface {
|
|
// HasCoercibility returns if the Coercibility value is initialized.
|
|
HasCoercibility() bool
|
|
|
|
// Coercibility returns the coercibility value which is used to check collations.
|
|
Coercibility() Coercibility
|
|
|
|
// SetCoercibility sets a specified coercibility for this expression.
|
|
SetCoercibility(val Coercibility)
|
|
|
|
// Repertoire returns the repertoire value which is used to check collations.
|
|
Repertoire() Repertoire
|
|
|
|
// SetRepertoire sets a specified repertoire for this expression.
|
|
SetRepertoire(r Repertoire)
|
|
|
|
// CharsetAndCollation gets charset and collation.
|
|
CharsetAndCollation() (string, string)
|
|
|
|
// SetCharsetAndCollation sets charset and collation.
|
|
SetCharsetAndCollation(chs, coll string)
|
|
}
|
|
|
|
// Coercibility describes how readily the collation of one item can be coerced
// to the collation of another; it is used when checking collations.
// See https://dev.mysql.com/doc/refman/8.0/en/charset-collation-coercibility.html
type Coercibility int

// The values are numbered from 0 in declaration order.
const (
	// CoercibilityExplicit is derived from an explicit COLLATE clause.
	CoercibilityExplicit Coercibility = iota
	// CoercibilityNone is derived from the concatenation of two strings with different collations.
	CoercibilityNone
	// CoercibilityImplicit is derived from a column or a stored routine parameter or local variable or cast() function.
	CoercibilityImplicit
	// CoercibilitySysconst is derived from a "system constant" (the string returned by functions such as USER() or VERSION()).
	CoercibilitySysconst
	// CoercibilityCoercible is derived from a literal.
	CoercibilityCoercible
	// CoercibilityNumeric is derived from a numeric or temporal value.
	CoercibilityNumeric
	// CoercibilityIgnorable is derived from NULL or an expression that is derived from NULL.
	CoercibilityIgnorable
)
|
|
|
|
var (
|
|
// CollationStrictnessGroup group collation by strictness
|
|
CollationStrictnessGroup = map[string]int{
|
|
"utf8_general_ci": 1,
|
|
"utf8mb4_general_ci": 1,
|
|
"utf8_unicode_ci": 2,
|
|
"utf8mb4_unicode_ci": 2,
|
|
charset.CollationASCII: 3,
|
|
charset.CollationLatin1: 3,
|
|
charset.CollationUTF8: 3,
|
|
charset.CollationUTF8MB4: 3,
|
|
charset.CollationBin: 4,
|
|
}
|
|
|
|
// CollationStrictness indicates the strictness of comparison of the collation. The unequal order in a weak collation also holds in a strict collation.
|
|
// For example, if a != b in a weak collation(e.g. general_ci), then there must be a != b in a strict collation(e.g. _bin).
|
|
// collation group id in value is stricter than collation group id in key
|
|
CollationStrictness = map[int][]int{
|
|
1: {3, 4},
|
|
2: {3, 4},
|
|
3: {4},
|
|
4: {},
|
|
}
|
|
)
|
|
|
|
// Repertoire is the collection of characters in a character set.
// See https://dev.mysql.com/doc/refman/8.0/en/charset-repertoire.html.
// Only string expressions have a meaningful Repertoire; for non-string
// expressions its value does not matter.
type Repertoire int

// The values are bit flags so that repertoires can be combined with |.
const (
	// ASCII is pure ASCII U+0000..U+007F.
	ASCII Repertoire = 1 << iota
	// EXTENDED is extended characters: U+0080..U+FFFF
	EXTENDED
	// UNICODE is ASCII | EXTENDED
	UNICODE = ASCII | EXTENDED
)
|
|
|
|
func deriveCoercibilityForScarlarFunc(sf *ScalarFunction) Coercibility {
|
|
panic("this function should never be called")
|
|
}
|
|
|
|
func deriveCoercibilityForConstant(c *Constant) Coercibility {
|
|
if c.Value.IsNull() {
|
|
return CoercibilityIgnorable
|
|
} else if c.RetType.EvalType() != types.ETString {
|
|
return CoercibilityNumeric
|
|
}
|
|
return CoercibilityCoercible
|
|
}
|
|
|
|
func deriveCoercibilityForColumn(c *Column) Coercibility {
|
|
// For specified type null, it should return CoercibilityIgnorable, which means it got the lowest priority in DeriveCollationFromExprs.
|
|
if c.RetType.Tp == mysql.TypeNull {
|
|
return CoercibilityIgnorable
|
|
}
|
|
if c.RetType.EvalType() != types.ETString {
|
|
return CoercibilityNumeric
|
|
}
|
|
return CoercibilityImplicit
|
|
}
|
|
|
|
func deriveCollation(ctx sessionctx.Context, funcName string, args []Expression, retType types.EvalType, argTps ...types.EvalType) (ec *ExprCollation, err error) {
|
|
switch funcName {
|
|
case ast.Concat, ast.ConcatWS, ast.Lower, ast.Lcase, ast.Reverse, ast.Upper, ast.Ucase, ast.Quote, ast.Coalesce:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args...)
|
|
case ast.Left, ast.Right, ast.Repeat, ast.Trim, ast.LTrim, ast.RTrim, ast.Substr, ast.SubstringIndex, ast.Replace, ast.Substring, ast.Mid, ast.Translate:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args[0])
|
|
case ast.InsertFunc:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args[0], args[3])
|
|
case ast.Lpad, ast.Rpad:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args[0], args[2])
|
|
case ast.Elt, ast.ExportSet, ast.MakeSet:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args[1:]...)
|
|
case ast.FindInSet, ast.Regexp:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, types.ETInt, args...)
|
|
case ast.Field:
|
|
if argTps[0] == types.ETString {
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args...)
|
|
}
|
|
case ast.Locate, ast.Instr, ast.Position:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args[0], args[1])
|
|
case ast.GE, ast.LE, ast.GT, ast.LT, ast.EQ, ast.NE, ast.NullEQ, ast.Strcmp:
|
|
// if compare type is string, we should determine which collation should be used.
|
|
if argTps[0] == types.ETString {
|
|
ec, err = CheckAndDeriveCollationFromExprs(ctx, funcName, types.ETInt, args...)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
ec.Coer = CoercibilityNumeric
|
|
ec.Repe = ASCII
|
|
return ec, nil
|
|
}
|
|
case ast.If:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args[1], args[2])
|
|
case ast.Ifnull:
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, args[0], args[1])
|
|
case ast.Like:
|
|
ec, err = CheckAndDeriveCollationFromExprs(ctx, funcName, types.ETInt, args[0], args[1])
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
ec.Coer = CoercibilityNumeric
|
|
ec.Repe = ASCII
|
|
return ec, nil
|
|
case ast.In:
|
|
if args[0].GetType().EvalType() == types.ETString {
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, types.ETInt, args...)
|
|
}
|
|
case ast.DateFormat, ast.TimeFormat:
|
|
charsetInfo, collation := ctx.GetSessionVars().GetCharsetInfo()
|
|
return &ExprCollation{args[1].Coercibility(), args[1].Repertoire(), charsetInfo, collation}, nil
|
|
case ast.Cast:
|
|
// We assume all the cast are implicit.
|
|
ec = &ExprCollation{args[0].Coercibility(), args[0].Repertoire(), args[0].GetType().Charset, args[0].GetType().Collate}
|
|
// Non-string type cast to string type should use @@character_set_connection and @@collation_connection.
|
|
// String type cast to string type should keep its original charset and collation. It should not happen.
|
|
if retType == types.ETString && argTps[0] != types.ETString {
|
|
ec.Charset, ec.Collation = ctx.GetSessionVars().GetCharsetInfo()
|
|
}
|
|
return ec, nil
|
|
case ast.Case:
|
|
// FIXME: case function aggregate collation is not correct.
|
|
// We should only aggregate the `then expression`,
|
|
// case ... when ... expression will be rewritten to:
|
|
// args: eq scalar func(args: value, condition1), result1,
|
|
// eq scalar func(args: value, condition2), result2,
|
|
// ...
|
|
// else clause
|
|
// Or
|
|
// args: condition1, result1,
|
|
// condition2, result2,
|
|
// ...
|
|
// else clause
|
|
// so, arguments with odd index are the `then expression`.
|
|
if argTps[1] == types.ETString {
|
|
fieldArgs := make([]Expression, 0)
|
|
for i := 1; i < len(args); i += 2 {
|
|
fieldArgs = append(fieldArgs, args[i])
|
|
}
|
|
if len(args)%2 == 1 {
|
|
fieldArgs = append(fieldArgs, args[len(args)-1])
|
|
}
|
|
return CheckAndDeriveCollationFromExprs(ctx, funcName, retType, fieldArgs...)
|
|
}
|
|
case ast.Database, ast.User, ast.CurrentUser, ast.Version, ast.CurrentRole, ast.TiDBVersion:
|
|
chs, coll := charset.GetDefaultCharsetAndCollate()
|
|
return &ExprCollation{CoercibilitySysconst, UNICODE, chs, coll}, nil
|
|
case ast.Format, ast.Space, ast.ToBase64, ast.UUID, ast.Hex, ast.MD5, ast.SHA, ast.SHA2:
|
|
// should return ASCII repertoire, MySQL's doc says it depends on character_set_connection, but it not true from its source code.
|
|
ec = &ExprCollation{Coer: CoercibilityCoercible, Repe: ASCII}
|
|
ec.Charset, ec.Collation = ctx.GetSessionVars().GetCharsetInfo()
|
|
return ec, nil
|
|
}
|
|
|
|
ec = &ExprCollation{CoercibilityNumeric, ASCII, charset.CharsetBin, charset.CollationBin}
|
|
if retType == types.ETString {
|
|
ec.Charset, ec.Collation = ctx.GetSessionVars().GetCharsetInfo()
|
|
ec.Coer = CoercibilityCoercible
|
|
if ec.Charset != charset.CharsetASCII {
|
|
ec.Repe = UNICODE
|
|
}
|
|
}
|
|
return ec, nil
|
|
}
|
|
|
|
// DeriveCollationFromExprs derives collation information from these expressions.
|
|
// Deprecated, use CheckAndDeriveCollationFromExprs instead.
|
|
// TODO: remove this function after the all usage is replaced by CheckAndDeriveCollationFromExprs
|
|
func DeriveCollationFromExprs(ctx sessionctx.Context, exprs ...Expression) (dstCharset, dstCollation string) {
|
|
collation := inferCollation(exprs...)
|
|
return collation.Charset, collation.Collation
|
|
}
|
|
|
|
// CheckAndDeriveCollationFromExprs derives collation information from these expressions, return error if derives collation error.
|
|
func CheckAndDeriveCollationFromExprs(ctx sessionctx.Context, funcName string, evalType types.EvalType, args ...Expression) (et *ExprCollation, err error) {
|
|
ec := inferCollation(args...)
|
|
if ec == nil {
|
|
return nil, illegalMixCollationErr(funcName, args)
|
|
}
|
|
|
|
if evalType != types.ETString && ec.Coer == CoercibilityNone {
|
|
return nil, illegalMixCollationErr(funcName, args)
|
|
}
|
|
|
|
if evalType == types.ETString && ec.Coer == CoercibilityNumeric {
|
|
ec.Charset, ec.Collation = ctx.GetSessionVars().GetCharsetInfo()
|
|
ec.Coer = CoercibilityCoercible
|
|
ec.Repe = ASCII
|
|
}
|
|
|
|
if !safeConvert(ctx, ec, args...) {
|
|
return nil, illegalMixCollationErr(funcName, args)
|
|
}
|
|
|
|
return ec, nil
|
|
}
|
|
|
|
func safeConvert(ctx sessionctx.Context, ec *ExprCollation, args ...Expression) bool {
|
|
enc := charset.FindEncodingTakeUTF8AsNoop(ec.Charset)
|
|
for _, arg := range args {
|
|
if arg.GetType().Charset == ec.Charset {
|
|
continue
|
|
}
|
|
|
|
// If value has ASCII repertoire, or it is binary string, just skip it.
|
|
if arg.Repertoire() == ASCII || types.IsBinaryStr(arg.GetType()) {
|
|
continue
|
|
}
|
|
|
|
if c, ok := arg.(*Constant); ok {
|
|
str, isNull, err := c.EvalString(ctx, chunk.Row{})
|
|
if err != nil {
|
|
return false
|
|
}
|
|
if isNull {
|
|
continue
|
|
}
|
|
if !enc.IsValid(hack.Slice(str)) {
|
|
return false
|
|
}
|
|
} else {
|
|
if arg.GetType().Collate != charset.CharsetBin && ec.Charset != charset.CharsetBin && !isUnicodeCollation(ec.Charset) {
|
|
return false
|
|
}
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// inferCollation infers collation, charset, coercibility and check the legitimacy.
|
|
func inferCollation(exprs ...Expression) *ExprCollation {
|
|
if len(exprs) == 0 {
|
|
// TODO: see if any function with no arguments could run here.
|
|
dstCharset, dstCollation := charset.GetDefaultCharsetAndCollate()
|
|
return &ExprCollation{
|
|
Coer: CoercibilityIgnorable,
|
|
Repe: UNICODE,
|
|
Charset: dstCharset,
|
|
Collation: dstCollation,
|
|
}
|
|
}
|
|
|
|
repertoire := exprs[0].Repertoire()
|
|
coercibility := exprs[0].Coercibility()
|
|
dstCharset, dstCollation := exprs[0].GetType().Charset, exprs[0].GetType().Collate
|
|
unknownCS := false
|
|
|
|
// Aggregate arguments one by one, agg(a, b, c) := agg(agg(a, b), c).
|
|
for _, arg := range exprs[1:] {
|
|
// If one of the arguments is binary charset, we allow it can be used with other charsets.
|
|
// If they have the same coercibility, let the binary charset one to be the winner because binary has more precedence.
|
|
if dstCollation == charset.CollationBin || arg.GetType().Collate == charset.CollationBin {
|
|
if coercibility > arg.Coercibility() || (coercibility == arg.Coercibility() && arg.GetType().Collate == charset.CollationBin) {
|
|
coercibility, dstCharset, dstCollation = arg.Coercibility(), arg.GetType().Charset, arg.GetType().Collate
|
|
}
|
|
repertoire |= arg.Repertoire()
|
|
continue
|
|
}
|
|
|
|
// If charset is different, only if conversion without data loss is allowed:
|
|
// 1. ASCII repertoire is always convertible.
|
|
// 2. Non-Unicode charset can convert to Unicode charset.
|
|
// 3. utf8 can convert to utf8mb4.
|
|
// 4. constant value is allowed because we can eval and convert it directly.
|
|
// If we can not aggregate these two collations, we will get CoercibilityNone and wait for an explicit COLLATE clause, if
|
|
// there is no explicit COLLATE clause, we will get an error.
|
|
if dstCharset != arg.GetType().Charset {
|
|
switch {
|
|
case coercibility < arg.Coercibility():
|
|
if arg.Repertoire() == ASCII || arg.Coercibility() >= CoercibilitySysconst || isUnicodeCollation(dstCharset) {
|
|
repertoire |= arg.Repertoire()
|
|
continue
|
|
}
|
|
case coercibility == arg.Coercibility():
|
|
if (isUnicodeCollation(dstCharset) && !isUnicodeCollation(arg.GetType().Charset)) || (dstCharset == charset.CharsetUTF8MB4 && arg.GetType().Charset == charset.CharsetUTF8) {
|
|
repertoire |= arg.Repertoire()
|
|
continue
|
|
} else if (isUnicodeCollation(arg.GetType().Charset) && !isUnicodeCollation(dstCharset)) || (arg.GetType().Charset == charset.CharsetUTF8MB4 && dstCharset == charset.CharsetUTF8) {
|
|
coercibility, dstCharset, dstCollation = arg.Coercibility(), arg.GetType().Charset, arg.GetType().Collate
|
|
repertoire |= arg.Repertoire()
|
|
continue
|
|
} else if repertoire == ASCII && arg.Repertoire() != ASCII {
|
|
coercibility, dstCharset, dstCollation = arg.Coercibility(), arg.GetType().Charset, arg.GetType().Collate
|
|
repertoire |= arg.Repertoire()
|
|
continue
|
|
} else if repertoire != ASCII && arg.Repertoire() == ASCII {
|
|
repertoire |= arg.Repertoire()
|
|
continue
|
|
}
|
|
case coercibility > arg.Coercibility():
|
|
if repertoire == ASCII || coercibility >= CoercibilitySysconst || isUnicodeCollation(arg.GetType().Charset) {
|
|
coercibility, dstCharset, dstCollation = arg.Coercibility(), arg.GetType().Charset, arg.GetType().Collate
|
|
repertoire |= arg.Repertoire()
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Cannot apply conversion.
|
|
repertoire |= arg.Repertoire()
|
|
coercibility, dstCharset, dstCollation = CoercibilityNone, charset.CharsetBin, charset.CollationBin
|
|
unknownCS = true
|
|
} else {
|
|
// If charset is the same, use lower coercibility, if coercibility is the same and none of them are _bin,
|
|
// derive to CoercibilityNone and _bin collation.
|
|
switch {
|
|
case coercibility == arg.Coercibility():
|
|
if dstCollation == arg.GetType().Collate {
|
|
} else if coercibility == CoercibilityExplicit {
|
|
return nil
|
|
} else if isBinCollation(dstCollation) {
|
|
} else if isBinCollation(arg.GetType().Collate) {
|
|
coercibility, dstCharset, dstCollation = arg.Coercibility(), arg.GetType().Charset, arg.GetType().Collate
|
|
} else {
|
|
coercibility, dstCollation, dstCharset = CoercibilityNone, getBinCollation(arg.GetType().Charset), arg.GetType().Charset
|
|
}
|
|
case coercibility > arg.Coercibility():
|
|
coercibility, dstCharset, dstCollation = arg.Coercibility(), arg.GetType().Charset, arg.GetType().Collate
|
|
}
|
|
repertoire |= arg.Repertoire()
|
|
}
|
|
}
|
|
|
|
if unknownCS && coercibility != CoercibilityExplicit {
|
|
return nil
|
|
}
|
|
|
|
return &ExprCollation{
|
|
Coer: coercibility,
|
|
Repe: repertoire,
|
|
Charset: dstCharset,
|
|
Collation: dstCollation,
|
|
}
|
|
}
|
|
|
|
func isUnicodeCollation(ch string) bool {
|
|
return ch == charset.CharsetUTF8 || ch == charset.CharsetUTF8MB4
|
|
}
|
|
|
|
func isBinCollation(collate string) bool {
|
|
return collate == charset.CollationASCII || collate == charset.CollationLatin1 ||
|
|
collate == charset.CollationUTF8 || collate == charset.CollationUTF8MB4 ||
|
|
collate == charset.CollationGBKBin
|
|
}
|
|
|
|
// getBinCollation get binary collation by charset
|
|
func getBinCollation(cs string) string {
|
|
switch cs {
|
|
case charset.CharsetUTF8:
|
|
return charset.CollationUTF8
|
|
case charset.CharsetUTF8MB4:
|
|
return charset.CollationUTF8MB4
|
|
case charset.CharsetGBK:
|
|
return charset.CollationGBKBin
|
|
}
|
|
|
|
logutil.BgLogger().Error("unexpected charset " + cs)
|
|
// it must return something, never reachable
|
|
return charset.CollationUTF8MB4
|
|
}
|
|
|
|
// coerString holds the display name of each coercibility level; it is indexed
// by the Coercibility value when building error messages.
var coerString = []string{
	"EXPLICIT",
	"NONE",
	"IMPLICIT",
	"SYSCONST",
	"COERCIBLE",
	"NUMERIC",
	"IGNORABLE",
}
|
|
|
|
func illegalMixCollationErr(funcName string, args []Expression) error {
|
|
funcName = GetDisplayName(funcName)
|
|
|
|
switch len(args) {
|
|
case 2:
|
|
return collate.ErrIllegalMix2Collation.GenWithStackByArgs(args[0].GetType().Collate, coerString[args[0].Coercibility()], args[1].GetType().Collate, coerString[args[1].Coercibility()], funcName)
|
|
case 3:
|
|
return collate.ErrIllegalMix3Collation.GenWithStackByArgs(args[0].GetType().Collate, coerString[args[0].Coercibility()], args[1].GetType().Collate, coerString[args[1].Coercibility()], args[2].GetType().Collate, coerString[args[2].Coercibility()], funcName)
|
|
default:
|
|
return collate.ErrIllegalMixCollation.GenWithStackByArgs(funcName)
|
|
}
|
|
}
|