// Copyright 2019 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package parser

import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"
	hash2 "hash"
	"strings"
	"sync"
	"unsafe"

	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/pkg/parser/charset"
)

// Digest stores the fixed-length hash value.
type Digest struct {
	b   []byte
	str string
}

// NewDigest returns a new digest.
func NewDigest(b []byte) *Digest {
	return &Digest{
		b:   b,
		str: hex.EncodeToString(b),
	}
}

// String returns the digest hex string.
func (d *Digest) String() string {
	return d.str
}

// Bytes returns the digest byte slice.
func (d *Digest) Bytes() []byte {
	return d.b
}

// DigestHash generates the digest of statements.
// It hashes the normalized form of the statement text, which removes the
// general properties of a statement but keeps its specific properties.
//
// For example, both DigestHash('select 1') and DigestHash('select 2') =>
// e1c71d1661ae46e09b7aaec1c390957f0d6260410df4e4bc71b9c8d681021471
//
// Deprecated: It is logically consistent with NormalizeDigest.
func DigestHash(sql string) (digest *Digest) {
	d := digesterPool.Get().(*sqlDigester)
	digest = d.doDigest(sql)
	digesterPool.Put(d)
	return
}

// DigestNormalized generates the digest of a normalized sql.
// It hashes an already-normalized SQL string.
// Normalize followed by DigestNormalized is equivalent to NormalizeDigest.
//
// For example: DigestNormalized('select ?')
// DigestNormalized should be called with a normalized SQL string (like 'select ?')
// generated by the Normalize function. Do not call it with SQL that is not
// normalized: DigestNormalized('select 1') and DigestNormalized('select 2') are
// not the same.
func DigestNormalized(normalized string) (digest *Digest) {
	d := digesterPool.Get().(*sqlDigester)
	digest = d.doDigestNormalized(normalized)
	digesterPool.Put(d)
	return
}

// Normalize generates the normalized statements.
// It returns the normalized form of the statement text, which removes the
// general properties of a statement but keeps its specific properties.
// Possible values for 'redact' are "OFF", "ON" or "MARKER". Passing "" is
// treated as "OFF".
//
// When "OFF", the SQL is returned as is.
// For example, when "ON": Normalize('select 1 from b where a = 1') => 'select ? from b where a = ?'
// For example, when "MARKER": Normalize('select 1 from b where a = 1') => 'select ‹1› from b where a = ‹1›'
func Normalize(sql string, redact string) (result string) {
	if redact == "" || redact == errors.RedactLogDisable {
		return sql
	}
	d := digesterPool.Get().(*sqlDigester)
	result = d.doNormalize(sql, redact, false)
	digesterPool.Put(d)
	return
}
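// exampleNormalizeRedaction is an illustrative sketch and not part of the
// original file: it shows how the three redact modes accepted by Normalize are
// expected to behave. The commented outputs are assumptions derived from the
// doc comments above, not verified results.
func exampleNormalizeRedaction() {
	// "" and "OFF" return the input unchanged.
	_ = Normalize("select 1 from b where a = 1", errors.RedactLogDisable)
	// "ON" replaces literals with "?", roughly 'select ? from b where a = ?'.
	_ = Normalize("select 1 from b where a = 1", errors.RedactLogEnable)
	// "MARKER" wraps literals in ‹…› markers, roughly 'select ‹1› from b where a = ‹1›'.
	_ = Normalize("select 1 from b where a = 1", errors.RedactLogMarker)
}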
// NormalizeForBinding generates the normalized statements with additional binding rules.
// It returns the normalized form of the statement text, which removes the
// general properties of a statement but keeps its specific properties.
//
// For example: NormalizeForBinding('select 1 from b where a = 1') => 'select ? from b where a = ?'
func NormalizeForBinding(sql string, forPlanReplayerReload bool) (result string) {
	d := digesterPool.Get().(*sqlDigester)
	result = d.doNormalizeForBinding(sql, false, forPlanReplayerReload)
	digesterPool.Put(d)
	return
}

// NormalizeKeepHint generates the normalized statements, but keeps the hints.
// It returns the normalized form of the statement text with hints, which removes
// the general properties of a statement but keeps its specific properties.
//
// For example: NormalizeKeepHint('select /*+ use_index(t, primary) */ 1 from b where a = 1') => 'select /*+ use_index(t, primary) */ ? from b where a = ?'
func NormalizeKeepHint(sql string) (result string) {
	d := digesterPool.Get().(*sqlDigester)
	result = d.doNormalize(sql, errors.RedactLogEnable, true)
	digesterPool.Put(d)
	return
}

// NormalizeDigest combines Normalize and DigestNormalized into one method.
func NormalizeDigest(sql string) (normalized string, digest *Digest) {
	d := digesterPool.Get().(*sqlDigester)
	normalized, digest = d.doNormalizeDigest(sql)
	digesterPool.Put(d)
	return
}

// NormalizeDigestForBinding combines Normalize and DigestNormalized into one method with additional binding rules.
func NormalizeDigestForBinding(sql string) (normalized string, digest *Digest) {
	d := digesterPool.Get().(*sqlDigester)
	normalized, digest = d.doNormalizeDigestForBinding(sql)
	digesterPool.Put(d)
	return
}

var digesterPool = sync.Pool{
	New: func() interface{} {
		return &sqlDigester{
			lexer:  NewScanner(""),
			hasher: sha256.New(),
		}
	},
}

// sqlDigester is used to compute DigestHash or Normalize for sql.
type sqlDigester struct {
	buffer bytes.Buffer
	lexer  *Scanner
	hasher hash2.Hash
	tokens tokenDeque
}

func (d *sqlDigester) doDigestNormalized(normalized string) (digest *Digest) {
	b := unsafe.Slice(unsafe.StringData(normalized), len(normalized))
	d.hasher.Write(b)
	digest = NewDigest(d.hasher.Sum(nil))
	d.hasher.Reset()
	return
}

func (d *sqlDigester) doDigest(sql string) (digest *Digest) {
	d.normalize(sql, errors.RedactLogEnable, false, false, false)
	d.hasher.Write(d.buffer.Bytes())
	d.buffer.Reset()
	digest = NewDigest(d.hasher.Sum(nil))
	d.hasher.Reset()
	return
}

func (d *sqlDigester) doNormalize(sql string, redact string, keepHint bool) (result string) {
	d.normalize(sql, redact, keepHint, false, false)
	result = d.buffer.String()
	d.buffer.Reset()
	return
}

func (d *sqlDigester) doNormalizeForBinding(sql string, keepHint bool, forPlanReplayerReload bool) (result string) {
	d.normalize(sql, errors.RedactLogEnable, keepHint, true, forPlanReplayerReload)
	result = d.buffer.String()
	d.buffer.Reset()
	return
}

func (d *sqlDigester) doNormalizeDigest(sql string) (normalized string, digest *Digest) {
	d.normalize(sql, errors.RedactLogEnable, false, false, false)
	normalized = d.buffer.String()
	d.hasher.Write(d.buffer.Bytes())
	d.buffer.Reset()
	digest = NewDigest(d.hasher.Sum(nil))
	d.hasher.Reset()
	return
}

func (d *sqlDigester) doNormalizeDigestForBinding(sql string) (normalized string, digest *Digest) {
	d.normalize(sql, errors.RedactLogEnable, false, true, false)
	normalized = d.buffer.String()
	d.hasher.Write(d.buffer.Bytes())
	d.buffer.Reset()
	digest = NewDigest(d.hasher.Sum(nil))
	d.hasher.Reset()
	return
}
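// exampleNormalizeDigestEquivalence is an illustrative sketch and not part of
// the original file: it demonstrates the equivalence stated above, i.e. that
// Normalize followed by DigestNormalized produces the same digest as
// NormalizeDigest for the same statement.
func exampleNormalizeDigestEquivalence(sql string) bool {
	normalized, digest := NormalizeDigest(sql)
	// DigestNormalized must only be fed text that is already normalized.
	recomputed := DigestNormalized(normalized)
	return digest.String() == recomputed.String()
}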
const (
	// genericSymbol represents the parameter placeholder ("?") in a statement.
	// It can be any value as long as it does not collide with other token values.
	genericSymbol = -1
	// genericSymbolList represents a parameter placeholder list ("?, ?, ...") in a statement.
	// It can be any value as long as it does not collide with other token values.
	genericSymbolList = -2
)

func (d *sqlDigester) normalize(sql string, redact string, keepHint bool, forBinding bool, forPlanReplayerReload bool) {
	d.lexer.reset(sql)
	d.lexer.setKeepHint(keepHint)
	for {
		tok, pos, lit := d.lexer.scan()
		if tok == invalid {
			break
		}
		if pos.Offset == len(sql) || (pos.Offset == len(sql)-1 && sql[pos.Offset] == ';') {
			break
		}
		currTok := token{tok, strings.ToLower(lit)}

		if !keepHint && d.reduceOptimizerHint(&currTok) {
			continue
		}
		d.reduceLit(&currTok, redact, forBinding, forPlanReplayerReload)
		if forPlanReplayerReload {
			// Plan-replayer-specific rule: change IN (...) to IN (?). This avoids
			// plan replayer load failures caused by parse errors.
			d.replaceSingleLiteralWithInList(&currTok)
		} else if forBinding {
			// Binding-specific matching rule: IN (?) => IN ( ... ) #44298
			d.reduceInListWithSingleLiteral(&currTok)
			// In (Row(...)) => In (...) #51222
			d.reduceInRowListWithSingleLiteral(&currTok)
		}

		if currTok.tok == identifier {
			if strings.HasPrefix(currTok.lit, "_") {
				_, err := charset.GetCharsetInfo(currTok.lit[1:])
				if err == nil {
					currTok.tok = underscoreCS
					goto APPEND
				}
			}

			if tok1 := d.lexer.isTokenIdentifier(currTok.lit, pos.Offset); tok1 != 0 {
				currTok.tok = tok1
			}
		}
	APPEND:
		d.tokens.pushBack(currTok)
	}
	d.lexer.reset("")
	for i, token := range d.tokens {
		if i > 0 {
			d.buffer.WriteRune(' ')
		}
		if token.tok == singleAtIdentifier {
			d.buffer.WriteString("@")
			d.buffer.WriteString(token.lit)
		} else if token.tok == underscoreCS {
			d.buffer.WriteString("(_charset)")
		} else if token.tok == identifier || token.tok == quotedIdentifier {
			d.buffer.WriteByte('`')
			d.buffer.WriteString(token.lit)
			d.buffer.WriteByte('`')
		} else {
			d.buffer.WriteString(token.lit)
		}
	}
	d.tokens.reset()
}

func (d *sqlDigester) reduceOptimizerHint(tok *token) (reduced bool) {
	// Ignore /*+..*/ hint comments.
	if tok.tok == hintComment {
		return
	}

	// Ignore force/use/ignore index(x).
	if tok.lit == "index" {
		toks := d.tokens.back(1)
		if len(toks) > 0 {
			switch strings.ToLower(toks[0].lit) {
			case "force", "use", "ignore":
				for {
					tok, _, lit := d.lexer.scan()
					if (tok == 0 && d.lexer.r.eof()) || tok == invalid {
						break
					}
					if lit == ")" {
						reduced = true
						d.tokens.popBack(1)
						break
					}
				}
				return
			}
		}
	}

	// Normalize straight_join to join.
	if tok.lit == "straight_join" {
		tok.lit = "join"
		return
	}
	return
}
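// exampleHintHandling is an illustrative sketch and not part of the original
// file: optimizer hints are dropped during plain normalization (see
// reduceOptimizerHint and setKeepHint above), while NormalizeKeepHint preserves
// them. The commented outputs are assumptions based on the doc comments, not
// verified results.
func exampleHintHandling() {
	_ = Normalize("select /*+ use_index(t, primary) */ 1 from b where a = 1", errors.RedactLogEnable)
	// roughly: 'select ? from b where a = ?'
	_ = NormalizeKeepHint("select /*+ use_index(t, primary) */ 1 from b where a = 1")
	// roughly: 'select /*+ use_index(t, primary) */ ? from b where a = ?'
}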
func (d *sqlDigester) reduceLit(currTok *token, redact string, forBinding bool, forPlanReplayer bool) {
	if !d.isLit(*currTok) {
		return
	}
	if redact == errors.RedactLogMarker && !forBinding && !forPlanReplayer {
		switch currTok.lit {
		case "?", "*":
			return
		}
		input := currTok.lit
		b := &strings.Builder{}
		b.Grow(len(input))
		_, _ = b.WriteRune('‹')
		for _, c := range input {
			// Escape marker runes inside the literal by doubling them.
			if c == '‹' || c == '›' {
				_, _ = b.WriteRune(c)
				_, _ = b.WriteRune(c)
			} else {
				_, _ = b.WriteRune(c)
			}
		}
		_, _ = b.WriteRune('›')
		currTok.lit = b.String()
		return
	}
	// count(*) => count(?)
	if currTok.lit == "*" {
		if d.isStarParam() {
			currTok.tok = genericSymbol
			currTok.lit = "?"
		}
		return
	}

	// "-x" or "+x" => "x"
	if d.isPrefixByUnary(currTok.tok) {
		d.tokens.popBack(1)
	}

	// "?, ?, ?, ?" => "..."
	last2 := d.tokens.back(2)
	if d.isGenericList(last2) {
		d.tokens.popBack(2)
		currTok.tok = genericSymbolList
		currTok.lit = "..."
		return
	}

	// "_charset ?, _charset ?" => "..."
	last4 := d.tokens.back(4)
	if toPop := d.isGenericListWithCharset(last4); toPop != 0 {
		d.tokens.popBack(toPop)
		currTok.tok = genericSymbolList
		currTok.lit = "..."
		return
	}

	// Aggressively reduce lists.
	if d.isGenericLists(last4) {
		d.tokens.popBack(4)
		currTok.tok = genericSymbolList
		currTok.lit = "..."
		return
	}

	// Reduce "In (row(...), row(...))" to "In (row(...))";
	// finally it will be reduced to "In (...)". Issue: #51222
	if forBinding {
		last9 := d.tokens.back(9)
		if d.isGenericRowListsWithIn(last9) {
			d.tokens.popBack(5)
			currTok.tok = genericSymbolList
			currTok.lit = "..."
			return
		}
	}

	// order by 1 => order by 1 (positional references are kept as-is)
	if currTok.tok == intLit {
		if d.isOrderOrGroupBy() {
			return
		}
	}

	// 2 => ?
	currTok.tok = genericSymbol
	currTok.lit = "?"
}

func (d *sqlDigester) isGenericLists(last4 []token) bool {
	if len(last4) < 4 {
		return false
	}
	if !(last4[0].tok == genericSymbol || last4[0].tok == genericSymbolList) {
		return false
	}
	if last4[1].lit != ")" {
		return false
	}
	if !d.isComma(last4[2]) {
		return false
	}
	if last4[3].lit != "(" {
		return false
	}
	return true
}

// In (Row(...), Row(...)) => In (Row(...))
func (d *sqlDigester) isGenericRowListsWithIn(last9 []token) bool {
	if len(last9) < 9 {
		return false
	}
	if !d.isInKeyword(last9[0]) {
		return false
	}
	if last9[1].lit != "(" {
		return false
	}
	if !d.isRowKeyword(last9[2]) {
		return false
	}
	if last9[3].lit != "(" {
		return false
	}
	if !(last9[4].tok == genericSymbol || last9[4].tok == genericSymbolList) {
		return false
	}
	if last9[5].lit != ")" {
		return false
	}
	if !d.isComma(last9[6]) {
		return false
	}
	if !d.isRowKeyword(last9[7]) {
		return false
	}
	if last9[8].lit != "(" {
		return false
	}
	return true
}

// IN (...) => IN (?) Issue: #43192
func (d *sqlDigester) replaceSingleLiteralWithInList(currTok *token) {
	last5 := d.tokens.back(5)
	if len(last5) == 5 &&
		d.isInKeyword(last5[0]) &&
		d.isLeftParen(last5[1]) &&
		last5[2].lit == "." && last5[3].lit == "." && last5[4].lit == "." &&
		d.isRightParen(*currTok) {
		d.tokens.popBack(3)
		d.tokens.pushBack(token{genericSymbol, "?"})
		return
	}
}

// IN (?) => IN (...) Issue: #44298
func (d *sqlDigester) reduceInListWithSingleLiteral(currTok *token) {
	last3 := d.tokens.back(3)
	if len(last3) == 3 &&
		d.isInKeyword(last3[0]) &&
		d.isLeftParen(last3[1]) &&
		last3[2].tok == genericSymbol &&
		d.isRightParen(*currTok) {
		d.tokens.popBack(1)
		d.tokens.pushBack(token{genericSymbolList, "..."})
		return
	}
}
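// exampleBindingInListReduction is an illustrative sketch and not part of the
// original file: it shows why the binding-specific reduction IN (?) => IN (...)
// above exists. The commented outcome is an assumption based on that rule, not
// a verified result.
func exampleBindingInListReduction() {
	short, _ := NormalizeDigestForBinding("select * from t where a in (1)")
	long, _ := NormalizeDigestForBinding("select * from t where a in (1, 2, 3)")
	// With the binding rules, both statements are expected to normalize to the
	// same text (roughly 'select * from t where a in ( ... )'), so one binding
	// can match IN lists of any length.
	_ = short == long
}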
// In (Row(...)) => In (...) Issue: #51222
func (d *sqlDigester) reduceInRowListWithSingleLiteral(currTok *token) {
	last6 := d.tokens.back(6)
	if len(last6) == 6 &&
		d.isInKeyword(last6[0]) &&
		d.isLeftParen(last6[1]) &&
		d.isRowKeyword(last6[2]) &&
		d.isLeftParen(last6[3]) &&
		(last6[4].tok == genericSymbolList || last6[4].tok == genericSymbol) &&
		d.isRightParen(last6[5]) &&
		d.isRightParen(*currTok) {
		d.tokens.popBack(4)
		d.tokens.pushBack(token{genericSymbolList, "..."})
		return
	}
}

func (d *sqlDigester) isPrefixByUnary(currTok int) (isUnary bool) {
	if !d.isNumLit(currTok) {
		return
	}
	last := d.tokens.back(1)
	if last == nil {
		return
	}
	// a[0] != '-' and a[0] != '+'
	if last[0].lit != "-" && last[0].lit != "+" {
		return
	}
	last2 := d.tokens.back(2)
	if last2 == nil {
		isUnary = true
		return
	}
	// '(-x' or ',-x' or ',+x' or '--x' or '+-x'
	switch last2[0].lit {
	case "(", ",", "+", "-", ">=", "is", "<=", "=", "<", ">":
		isUnary = true
	default:
	}
	// select -x or select +x
	last2Lit := strings.ToLower(last2[0].lit)
	if last2Lit == "select" {
		isUnary = true
	}
	return
}

func (d *sqlDigester) isGenericList(last2 []token) (generic bool) {
	if len(last2) < 2 {
		return false
	}
	if !d.isComma(last2[1]) {
		return false
	}
	switch last2[0].tok {
	case genericSymbol, genericSymbolList:
		generic = true
	default:
	}
	return
}

func (d *sqlDigester) isGenericListWithCharset(last []token) int {
	if len(last) < 3 {
		return 0
	}
	toPop := 0
	if len(last) >= 4 {
		// Eliminate the leading _charset introducer.
		if last[0].tok == underscoreCS {
			toPop = 1
		}
		last = last[1:]
	}
	if last[2].tok != underscoreCS {
		return 0
	}
	if !d.isGenericList(last[:2]) {
		return 0
	}
	return toPop + 3
}
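// exampleSignedAndCharsetLiterals is an illustrative sketch and not part of the
// original file: it exercises two of the helpers above. A unary sign in front
// of a numeric literal is folded into the placeholder (isPrefixByUnary), and a
// list of charset-prefixed literals collapses into "..." (isGenericListWithCharset).
// The commented outputs are assumptions, not verified results.
func exampleSignedAndCharsetLiterals() {
	_ = Normalize("select * from t where a = -1", errors.RedactLogEnable)
	// roughly: 'select * from t where a = ?' (the "-" is absorbed, not kept as '- ?')
	_ = Normalize("select _utf8mb4'a', _utf8mb4'b', _utf8mb4'c'", errors.RedactLogEnable)
	// roughly: 'select ...'
}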
"group by (1, 2)" should not convert to "group by (?, ?)" if last[1].lit == "(" { last = d.tokens.back(n + 1) if len(last) < 2 { return false } } orderOrGroupBy = (last[0].lit == "order" || last[0].lit == "group") && last[1].lit == "by" return } func (d *sqlDigester) isStarParam() (starParam bool) { last := d.tokens.back(1) if last == nil { starParam = false return } starParam = last[0].lit == "(" return } func (d *sqlDigester) isLit(t token) (beLit bool) { tok := t.tok if d.isNumLit(tok) || tok == stringLit || tok == bitLit || tok == paramMarker { beLit = true } else if t.lit == "*" { beLit = true } else if tok == null || (tok == identifier && strings.ToLower(t.lit) == "null") { beLit = true } return } func (*sqlDigester) isNumLit(tok int) (beNum bool) { switch tok { case intLit, decLit, floatLit, hexLit: beNum = true default: } return } func (*sqlDigester) isComma(tok token) (isComma bool) { isComma = tok.lit == "," return } func (*sqlDigester) isLeftParen(tok token) (isLeftParen bool) { isLeftParen = tok.lit == "(" return } func (*sqlDigester) isRightParen(tok token) (isLeftParen bool) { isLeftParen = tok.lit == ")" return } func (*sqlDigester) isInKeyword(tok token) (isInKeyword bool) { isInKeyword = tok.lit == "in" return } func (*sqlDigester) isRowKeyword(tok token) (isRowKeyword bool) { isRowKeyword = tok.lit == "row" return } type token struct { tok int lit string } type tokenDeque []token func (s *tokenDeque) reset() { *s = (*s)[:0] } func (s *tokenDeque) pushBack(t token) { *s = append(*s, t) } func (s *tokenDeque) popBack(n int) (t []token) { if len(*s) < n { t = nil return } t = (*s)[len(*s)-n:] *s = (*s)[:len(*s)-n] return } func (s *tokenDeque) back(n int) (t []token) { if len(*s)-n < 0 { return } t = (*s)[len(*s)-n:] return }