Files
tidb/pkg/parser/digester.go

689 lines
17 KiB
Go

// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.
package parser
import (
"bytes"
"crypto/sha256"
"encoding/hex"
hash2 "hash"
"strings"
"sync"
"unsafe"
"github.com/pingcap/errors"
"github.com/pingcap/tidb/pkg/parser/charset"
)
// Digest stores the fixed length hash value together with its cached hex form.
type Digest struct {
	b   []byte // raw hash bytes
	str string // hex encoding of b, computed once at construction
}

// NewDigest returns a new digest wrapping b. The hex string is computed
// eagerly so repeated String() calls are free.
func NewDigest(b []byte) *Digest {
	d := &Digest{b: b}
	d.str = hex.EncodeToString(b)
	return d
}

// String returns the digest hex string.
func (d *Digest) String() string {
	return d.str
}

// Bytes returns the digest byte slice.
func (d *Digest) Bytes() []byte {
	return d.b
}
// DigestHash generates the digest of statements.
// It hashes the normalized form of the statement text, which strips the
// general properties of a statement but keeps the specific ones.
//
// for example: both DigestHash('select 1') and DigestHash('select 2') => e1c71d1661ae46e09b7aaec1c390957f0d6260410df4e4bc71b9c8d681021471
//
// Deprecated: It is logically consistent with NormalizeDigest.
func DigestHash(sql string) *Digest {
	d := digesterPool.Get().(*sqlDigester)
	defer digesterPool.Put(d)
	return d.doDigest(sql)
}
// DigestNormalized generates the digest of a normalized sql.
// It hashes an already-normalized statement, so Normalize followed by
// DigestNormalized is equivalent to NormalizeDigest.
//
// for example: DigestNormalized('select ?')
// Only call this with SQL produced by Normalize (e.g. 'select ?'); for raw
// SQL, DigestNormalized('select 1') and DigestNormalized('select 2') are
// not the same.
func DigestNormalized(normalized string) *Digest {
	d := digesterPool.Get().(*sqlDigester)
	defer digesterPool.Put(d)
	return d.doDigestNormalized(normalized)
}
// Normalize generates the normalized statements.
// It returns the normalized form of the statement text, which removes the
// general properties of a statement but keeps the specific ones.
// possible values for 'redact' is "OFF", "ON" or "MARKER". Passing "" is seen as "OFF".
//
// when "OFF", it is returned as is
// for example, when "ON": Normalize('select 1 from b where a = 1') => 'select ? from b where a = ?'
// for example, when "MARKER": Normalize('select 1 from b where a = 1') => 'select ‹1› from b where a = ‹1›'
func Normalize(sql string, redact string) string {
	// "OFF" (or empty) means no redaction at all: return the input untouched.
	if redact == "" || redact == errors.RedactLogDisable {
		return sql
	}
	d := digesterPool.Get().(*sqlDigester)
	defer digesterPool.Put(d)
	return d.doNormalize(sql, redact, false)
}
// NormalizeForBinding generates the normalized statements with additional binding rules.
// It returns the normalized form of the statement text, which removes the
// general properties of a statement but keeps the specific ones.
//
// for example: NormalizeForBinding('select 1 from b where a = 1') => 'select ? from b where a = ?'
func NormalizeForBinding(sql string, forPlanReplayerReload bool) string {
	d := digesterPool.Get().(*sqlDigester)
	defer digesterPool.Put(d)
	return d.doNormalizeForBinding(sql, false, forPlanReplayerReload)
}
// NormalizeKeepHint generates the normalized statements, but keeps the hints.
// It returns the normalized form of statement text with hints preserved,
// removing the general properties of a statement but keeping the specific ones.
//
// for example: Normalize('select /*+ use_index(t, primary) */ 1 from b where a = 1') => 'select /*+ use_index(t, primary) */ ? from b where a = ?'
func NormalizeKeepHint(sql string) string {
	d := digesterPool.Get().(*sqlDigester)
	defer digesterPool.Put(d)
	return d.doNormalize(sql, errors.RedactLogEnable, true)
}
// NormalizeDigest combines Normalize and DigestNormalized into one method.
func NormalizeDigest(sql string) (normalized string, digest *Digest) {
	d := digesterPool.Get().(*sqlDigester)
	defer digesterPool.Put(d)
	return d.doNormalizeDigest(sql)
}
// NormalizeDigestForBinding combines Normalize and DigestNormalized into one method with additional binding rules.
func NormalizeDigestForBinding(sql string) (normalized string, digest *Digest) {
	d := digesterPool.Get().(*sqlDigester)
	defer digesterPool.Put(d)
	return d.doNormalizeDigestForBinding(sql)
}
// digesterPool reuses sqlDigester instances across calls so the scanner and
// sha256 hasher are not re-allocated for every digest/normalize request.
var digesterPool = sync.Pool{
New: func() interface{} {
return &sqlDigester{
lexer: NewScanner(""),
hasher: sha256.New(),
}
},
}
// sqlDigester is used to compute DigestHash or Normalize for sql.
// Instances are pooled via digesterPool; buffer, hasher and tokens are reset
// by the do* methods after each use so the instance can be reused.
type sqlDigester struct {
buffer bytes.Buffer // accumulates the normalized SQL text
lexer *Scanner // tokenizes the input statement
hasher hash2.Hash // sha256 state used to produce the digest
tokens tokenDeque // lookback window of emitted tokens for the reduce* helpers
}
// doDigestNormalized hashes an already-normalized statement and returns its
// digest, resetting the hasher so the digester can be reused.
func (d *sqlDigester) doDigestNormalized(normalized string) (digest *Digest) {
// Zero-copy view of the string's bytes; the io.Writer contract means the
// hasher must not retain or mutate b, so this avoids a copy per call.
b := unsafe.Slice(unsafe.StringData(normalized), len(normalized))
d.hasher.Write(b)
digest = NewDigest(d.hasher.Sum(nil))
d.hasher.Reset()
return
}
// doDigest normalizes sql (hints removed, literals replaced) and returns the
// sha256 digest of the normalized text. Buffer and hasher are reset before
// returning so the digester can be reused.
func (d *sqlDigester) doDigest(sql string) *Digest {
	d.normalize(sql, errors.RedactLogEnable, false, false, false)
	d.hasher.Write(d.buffer.Bytes())
	d.buffer.Reset()
	sum := d.hasher.Sum(nil)
	d.hasher.Reset()
	return NewDigest(sum)
}
// doNormalize normalizes sql under the given redact mode (optionally keeping
// hints) and returns the buffered result, resetting the buffer for reuse.
func (d *sqlDigester) doNormalize(sql string, redact string, keepHint bool) string {
	d.normalize(sql, redact, keepHint, false, false)
	normalized := d.buffer.String()
	d.buffer.Reset()
	return normalized
}
// doNormalizeForBinding normalizes sql with the binding-specific reductions
// enabled and returns the buffered result, resetting the buffer for reuse.
func (d *sqlDigester) doNormalizeForBinding(sql string, keepHint bool, forPlanReplayerReload bool) string {
	d.normalize(sql, errors.RedactLogEnable, keepHint, true, forPlanReplayerReload)
	normalized := d.buffer.String()
	d.buffer.Reset()
	return normalized
}
// doNormalizeDigest normalizes sql and returns both the normalized text and
// its sha256 digest in a single pass over the buffer.
func (d *sqlDigester) doNormalizeDigest(sql string) (string, *Digest) {
	d.normalize(sql, errors.RedactLogEnable, false, false, false)
	normalized := d.buffer.String()
	d.hasher.Write(d.buffer.Bytes())
	d.buffer.Reset()
	sum := d.hasher.Sum(nil)
	d.hasher.Reset()
	return normalized, NewDigest(sum)
}
// doNormalizeDigestForBinding normalizes sql with binding-specific rules and
// returns both the normalized text and its sha256 digest.
func (d *sqlDigester) doNormalizeDigestForBinding(sql string) (string, *Digest) {
	d.normalize(sql, errors.RedactLogEnable, false, true, false)
	normalized := d.buffer.String()
	d.hasher.Write(d.buffer.Bytes())
	d.buffer.Reset()
	sum := d.hasher.Sum(nil)
	d.hasher.Reset()
	return normalized, NewDigest(sum)
}
// Pseudo token codes assigned by the digester after literal reduction.
const (
// genericSymbol presents parameter holder ("?") in statement
// it can be any value as long as it is not repeated with other tokens.
genericSymbol = -1
// genericSymbolList presents parameter holder lists ("?, ?, ...") in statement
// it can be any value as long as it is not repeated with other tokens.
genericSymbolList = -2
)
// normalize scans sql and rebuilds a normalized version of it in d.buffer.
// redact selects how literals are handled (replaced with "?" or wrapped in
// ‹› markers); keepHint keeps optimizer hints in the output; forBinding and
// forPlanReplayerReload enable the extra IN-list rewrites used by plan
// bindings and plan replayer respectively. The token deque is cleared at the
// end, but the buffer is left for the caller to consume and reset.
func (d *sqlDigester) normalize(sql string, redact string, keepHint bool, forBinding bool, forPlanReplayerReload bool) {
d.lexer.reset(sql)
d.lexer.setKeepHint(keepHint)
for {
tok, pos, lit := d.lexer.scan()
if tok == invalid {
break
}
// Stop at end of input; a single trailing ';' is dropped as well.
if pos.Offset == len(sql) || (pos.Offset == len(sql)-1 && sql[pos.Offset] == ';') {
break
}
currTok := token{tok, strings.ToLower(lit)}
// Drop optimizer hints and force/use/ignore index(...) clauses.
if !keepHint && d.reduceOptimizerHint(&currTok) {
continue
}
// Replace literals with "?" / "..." / ‹…› markers depending on the mode.
d.reduceLit(&currTok, redact, forBinding, forPlanReplayerReload)
if forPlanReplayerReload {
// Apply for plan replayer to match specific rules, changing IN (...) to IN (?). This can avoid plan replayer load failures caused by parse errors.
d.replaceSingleLiteralWithInList(&currTok)
} else if forBinding {
// Apply binding matching specific rules, IN (?) => IN ( ... ) #44298
d.reduceInListWithSingleLiteral(&currTok)
// In (Row(...)) => In (...) #51222
d.reduceInRowListWithSingleLiteral(&currTok)
}
if currTok.tok == identifier {
// A leading '_' may be a charset introducer such as _utf8mb4.
if strings.HasPrefix(currTok.lit, "_") {
_, err := charset.GetCharsetInfo(currTok.lit[1:])
if err == nil {
currTok.tok = underscoreCS
goto APPEND
}
}
// Re-classify identifiers that are actually keywords in this position.
if tok1 := d.lexer.isTokenIdentifier(currTok.lit, pos.Offset); tok1 != 0 {
currTok.tok = tok1
}
}
APPEND:
d.tokens.pushBack(currTok)
}
// Drop the lexer's reference to sql so it is not retained by the pool.
d.lexer.reset("")
// Render the token stream into the buffer, space-separated; identifiers are
// backquoted and charset introducers collapse to "(_charset)".
for i, token := range d.tokens {
if i > 0 {
d.buffer.WriteRune(' ')
}
if token.tok == singleAtIdentifier {
d.buffer.WriteString("@")
d.buffer.WriteString(token.lit)
} else if token.tok == underscoreCS {
d.buffer.WriteString("(_charset)")
} else if token.tok == identifier || token.tok == quotedIdentifier {
d.buffer.WriteByte('`')
d.buffer.WriteString(token.lit)
d.buffer.WriteByte('`')
} else {
d.buffer.WriteString(token.lit)
}
}
d.tokens.reset()
}
// reduceOptimizerHint filters optimizer-hint related constructs out of the
// token stream. It returns true when the current token (and possibly the
// previously pushed force/use/ignore keyword) has been consumed and must not
// be appended. It also canonicalizes straight_join to join in place.
// NOTE(review): a hintComment token returns false (not reduced) here;
// presumably the scanner only emits hintComment when keepHint is set —
// confirm against Scanner.setKeepHint.
func (d *sqlDigester) reduceOptimizerHint(tok *token) (reduced bool) {
// ignore /*+..*/
if tok.tok == hintComment {
return
}
// ignore force/use/ignore index(x)
if tok.lit == "index" {
toks := d.tokens.back(1)
if len(toks) > 0 {
switch strings.ToLower(toks[0].lit) {
case "force", "use", "ignore":
// Consume tokens from the lexer up to the closing ')' of the index
// clause, then pop the force/use/ignore keyword already in the deque.
for {
tok, _, lit := d.lexer.scan()
if (tok == 0 && d.lexer.r.eof()) || tok == invalid {
break
}
if lit == ")" {
reduced = true
d.tokens.popBack(1)
break
}
}
return
}
}
}
// ignore straight_join
if tok.lit == "straight_join" {
tok.lit = "join"
return
}
return
}
// reduceLit normalizes a literal token in place:
//   - under RedactLogMarker (outside binding/plan-replayer modes) the literal
//     text is kept but wrapped in ‹…›, doubling any pre-existing marker runes;
//   - a literal following "?, " (or the charset-prefixed variant) collapses,
//     together with the preceding tokens, into the list symbol "...";
//   - any other literal becomes the generic "?" placeholder, except integers
//     following ORDER BY / GROUP BY, which must stay as written.
func (d *sqlDigester) reduceLit(currTok *token, redact string, forBinding bool, forPlanReplayer bool) {
if !d.isLit(*currTok) {
return
}
if redact == errors.RedactLogMarker && !forBinding && !forPlanReplayer {
// "?" and "*" are not user data; leave them unmarked.
switch currTok.lit {
case "?", "*":
return
}
input := currTok.lit
b := &strings.Builder{}
b.Grow(len(input))
_, _ = b.WriteRune('‹')
for _, c := range input {
// Escape marker runes inside the literal by doubling them.
if c == '‹' || c == '›' {
_, _ = b.WriteRune(c)
_, _ = b.WriteRune(c)
} else {
_, _ = b.WriteRune(c)
}
}
_, _ = b.WriteRune('›')
currTok.lit = b.String()
return
}
// count(*) => count(?)
if currTok.lit == "*" {
if d.isStarParam() {
currTok.tok = genericSymbol
currTok.lit = "?"
}
return
}
// "-x" or "+x" => "x": drop the unary sign already in the deque.
if d.isPrefixByUnary(currTok.tok) {
d.tokens.popBack(1)
}
// "?, ?, ?, ?" => "..."
last2 := d.tokens.back(2)
if d.isGenericList(last2) {
d.tokens.popBack(2)
currTok.tok = genericSymbolList
currTok.lit = "..."
return
}
// "_charset ?, _charset ?," => "..."
last4 := d.tokens.back(4)
if toPop := d.isGenericListWithCharset(last4); toPop != 0 {
d.tokens.popBack(toPop)
currTok.tok = genericSymbolList
currTok.lit = "..."
return
}
// Aggressive reduce lists: "(?), (?" style cross-paren sequences.
if d.isGenericLists(last4) {
d.tokens.popBack(4)
currTok.tok = genericSymbolList
currTok.lit = "..."
return
}
// reduce "In (row(...), row(...))" to "In (row(...))"
// final, it will be reduced to "In (...)". Issue: #51222
if forBinding {
last9 := d.tokens.back(9)
if d.isGenericRowListsWithIn(last9) {
d.tokens.popBack(5)
currTok.tok = genericSymbolList
currTok.lit = "..."
return
}
}
// Keep plain integers after ORDER BY / GROUP BY, e.g. "order by 1".
if currTok.tok == intLit {
if d.isOrderOrGroupBy() {
return
}
}
// 2 => ?
currTok.tok = genericSymbol
currTok.lit = "?"
}
// isGenericLists reports whether the last four tokens look like the tail of
// a placeholder list spanning parentheses: "? ) , (" (or "... ) , (").
func (d *sqlDigester) isGenericLists(last4 []token) bool {
	if len(last4) < 4 {
		return false
	}
	head := last4[0].tok
	return (head == genericSymbol || head == genericSymbolList) &&
		last4[1].lit == ")" &&
		d.isComma(last4[2]) &&
		last4[3].lit == "("
}
// isGenericRowListsWithIn reports whether the last nine tokens form the
// pattern "in ( row ( ? ) , row (" (where ? may also be "..."), i.e. the
// prefix that lets "In (Row(...), Row(...))" collapse to "In (Row(...))".
func (d *sqlDigester) isGenericRowListsWithIn(last9 []token) bool {
	// The checks below index up to last9[8], so require all nine tokens.
	// The previous guard (len < 7) would panic on a 7- or 8-element slice;
	// tokenDeque.back(9) happens to return nil or exactly nine tokens today,
	// but the guard should match what is actually dereferenced.
	if len(last9) < 9 {
		return false
	}
	if !d.isInKeyword(last9[0]) || last9[1].lit != "(" {
		return false
	}
	if !d.isRowKeyword(last9[2]) || last9[3].lit != "(" {
		return false
	}
	if last9[4].tok != genericSymbol && last9[4].tok != genericSymbolList {
		return false
	}
	if last9[5].lit != ")" || !d.isComma(last9[6]) {
		return false
	}
	return d.isRowKeyword(last9[7]) && last9[8].lit == "("
}
// replaceSingleLiteralWithInList rewrites "IN ( . . . )" back to "IN (?)".
// Issue: #43192
func (d *sqlDigester) replaceSingleLiteralWithInList(currTok *token) {
	if !d.isRightParen(*currTok) {
		return
	}
	last5 := d.tokens.back(5)
	if len(last5) != 5 {
		return
	}
	if !d.isInKeyword(last5[0]) || !d.isLeftParen(last5[1]) {
		return
	}
	// The ellipsis arrives as three separate "." tokens here.
	if last5[2].lit != "." || last5[3].lit != "." || last5[4].lit != "." {
		return
	}
	d.tokens.popBack(3)
	d.tokens.pushBack(token{genericSymbol, "?"})
}
// reduceInListWithSingleLiteral rewrites "IN (?)" to "IN (...)".
// Issue: #44298
func (d *sqlDigester) reduceInListWithSingleLiteral(currTok *token) {
	if !d.isRightParen(*currTok) {
		return
	}
	last3 := d.tokens.back(3)
	if len(last3) != 3 {
		return
	}
	if d.isInKeyword(last3[0]) && d.isLeftParen(last3[1]) && last3[2].tok == genericSymbol {
		d.tokens.popBack(1)
		d.tokens.pushBack(token{genericSymbolList, "..."})
	}
}
// reduceInRowListWithSingleLiteral rewrites "In (Row(...))" to "In (...)".
// Issue: #51222
func (d *sqlDigester) reduceInRowListWithSingleLiteral(currTok *token) {
	// last6 holds the six tokens before currTok: in ( row ( ? )
	// (renamed: it was misleadingly called last5 while holding six tokens).
	last6 := d.tokens.back(6)
	if len(last6) == 6 &&
		d.isInKeyword(last6[0]) &&
		d.isLeftParen(last6[1]) &&
		d.isRowKeyword(last6[2]) &&
		d.isLeftParen(last6[3]) &&
		(last6[4].tok == genericSymbolList || last6[4].tok == genericSymbol) &&
		d.isRightParen(last6[5]) &&
		d.isRightParen(*currTok) {
		// Drop "row ( ? )" and keep "in ( ... )" by pushing the list symbol.
		d.tokens.popBack(4)
		d.tokens.pushBack(token{genericSymbolList, "..."})
	}
}
// isPrefixByUnary reports whether the numeric literal currently being reduced
// is preceded by a unary "-" or "+" sign (as opposed to a binary operator),
// in which case the sign token should be dropped from the deque.
func (d *sqlDigester) isPrefixByUnary(currTok int) bool {
	if !d.isNumLit(currTok) {
		return false
	}
	prev := d.tokens.back(1)
	if prev == nil {
		return false
	}
	if sign := prev[0].lit; sign != "-" && sign != "+" {
		return false
	}
	prev2 := d.tokens.back(2)
	if prev2 == nil {
		// The sign is the very first token, so it must be unary.
		return true
	}
	// '(-x' or ',-x' or ',+x' or '--x' or '+-x' etc.
	switch prev2[0].lit {
	case "(", ",", "+", "-", ">=", "is", "<=", "=", "<", ">":
		return true
	}
	// select -x or select +x
	return strings.ToLower(prev2[0].lit) == "select"
}
// isGenericList reports whether the last two tokens are a placeholder
// followed by a comma, i.e. the tail of a "?, ?, ..." list.
func (d *sqlDigester) isGenericList(last2 []token) bool {
	if len(last2) < 2 || !d.isComma(last2[1]) {
		return false
	}
	return last2[0].tok == genericSymbol || last2[0].tok == genericSymbolList
}
// isGenericListWithCharset reports how many tokens should be popped when the
// tail of the deque looks like a charset-prefixed placeholder list
// ("_charset ?, _charset"). It returns 0 when the pattern does not match.
func (d *sqlDigester) isGenericListWithCharset(last []token) int {
if len(last) < 3 {
return 0
}
toPop := 0
if len(last) >= 4 {
// eliminate the first _charset
if last[0].tok == underscoreCS {
toPop = 1
}
// NOTE(review): the first token is sliced off even when it is not an
// underscoreCS — presumably intentional so the 3-token window aligns
// with the most recent tokens; confirm against the caller in reduceLit.
last = last[1:]
}
if last[2].tok != underscoreCS {
return 0
}
if !d.isGenericList(last[:2]) {
return 0
}
return toPop + 3
}
// isOrderOrGroupBy reports whether the tokens preceding the current integer
// literal form an ORDER BY / GROUP BY clause (possibly through a comma-
// separated number list and an opening paren), in which case the number is
// positional and must be preserved rather than replaced with "?".
func (d *sqlDigester) isOrderOrGroupBy() (orderOrGroupBy bool) {
var (
last []token
n int
)
// skip number item lists, e.g. "order by 1, 2, 3" should NOT convert to "order by ?, ?, ?"
// Step back two tokens at a time while the pattern "<item> ," repeats.
for n = 2; ; n += 2 {
last = d.tokens.back(n)
if len(last) < 2 {
return false
}
if !d.isComma(last[1]) {
break
}
}
// handle group by number item list surround by "()", e.g. "group by (1, 2)" should not convert to "group by (?, ?)"
if last[1].lit == "(" {
last = d.tokens.back(n + 1)
if len(last) < 2 {
return false
}
}
orderOrGroupBy = (last[0].lit == "order" || last[0].lit == "group") && last[1].lit == "by"
return
}
// isStarParam reports whether the current "*" directly follows "(", i.e. it
// is an argument as in count(*) rather than a select-list wildcard.
func (d *sqlDigester) isStarParam() bool {
	prev := d.tokens.back(1)
	return prev != nil && prev[0].lit == "("
}
// isLit reports whether t is a literal that should be normalized: numeric,
// string, bit or param-marker literals, the "*" symbol, and NULL (either the
// dedicated token or an identifier spelling "null").
func (d *sqlDigester) isLit(t token) bool {
	switch {
	case d.isNumLit(t.tok), t.tok == stringLit, t.tok == bitLit, t.tok == paramMarker:
		return true
	case t.lit == "*":
		return true
	case t.tok == null, t.tok == identifier && strings.ToLower(t.lit) == "null":
		return true
	}
	return false
}
// isNumLit reports whether tok is one of the numeric literal token codes.
func (*sqlDigester) isNumLit(tok int) bool {
	return tok == intLit || tok == decLit || tok == floatLit || tok == hexLit
}
// isComma reports whether tok is the "," separator.
func (*sqlDigester) isComma(tok token) bool {
	return tok.lit == ","
}

// isLeftParen reports whether tok is "(".
func (*sqlDigester) isLeftParen(tok token) bool {
	return tok.lit == "("
}

// isRightParen reports whether tok is ")".
// (Fixed: the named result was previously mis-named isLeftParen.)
func (*sqlDigester) isRightParen(tok token) bool {
	return tok.lit == ")"
}

// isInKeyword reports whether tok is the (already lowercased) "in" keyword.
func (*sqlDigester) isInKeyword(tok token) bool {
	return tok.lit == "in"
}

// isRowKeyword reports whether tok is the (already lowercased) "row" keyword.
func (*sqlDigester) isRowKeyword(tok token) bool {
	return tok.lit == "row"
}
// token is one scanned lexical unit: its scanner code and lowercased literal.
type token struct {
	tok int
	lit string
}

// tokenDeque is a slice-backed deque of tokens; the digester only ever
// inspects and removes tokens from the back.
type tokenDeque []token

// reset empties the deque while keeping the backing array for reuse.
func (s *tokenDeque) reset() {
	*s = (*s)[:0]
}

// pushBack appends t to the end of the deque.
func (s *tokenDeque) pushBack(t token) {
	*s = append(*s, t)
}

// popBack removes and returns the last n tokens, or nil when fewer than n
// tokens are present.
func (s *tokenDeque) popBack(n int) []token {
	if len(*s) < n {
		return nil
	}
	cut := len(*s) - n
	popped := (*s)[cut:]
	*s = (*s)[:cut]
	return popped
}

// back returns (without removing) the last n tokens, or nil when fewer than
// n tokens are present.
func (s *tokenDeque) back(n int) []token {
	if n > len(*s) {
		return nil
	}
	return (*s)[len(*s)-n:]
}