465 lines
15 KiB
Go
465 lines
15 KiB
Go
// Copyright 2022 PingCAP, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package statistics
|
|
|
|
import (
|
|
"math"
|
|
"strconv"
|
|
|
|
"github.com/pingcap/errors"
|
|
"github.com/pingcap/tidb/parser/model"
|
|
"github.com/pingcap/tidb/parser/mysql"
|
|
"github.com/pingcap/tidb/sessionctx"
|
|
"github.com/pingcap/tidb/types"
|
|
"github.com/pingcap/tidb/util/chunk"
|
|
"github.com/pingcap/tidb/util/codec"
|
|
"github.com/pingcap/tidb/util/collate"
|
|
"github.com/pingcap/tidb/util/logutil"
|
|
"github.com/pingcap/tidb/util/mathutil"
|
|
"github.com/pingcap/tidb/util/ranger"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// Column represents a column histogram.
// It bundles the embedded Histogram with the optional CMSketch, TopN and
// FMSketch structures and with metadata about the column they describe.
type Column struct {
	// Histogram is embedded, so its fields (for example NDV, NullCount,
	// TotColSize, Tp, Bounds and Buckets) are accessed directly on Column.
	Histogram
	// CMSketch may be nil; dropCMS sets it to nil on eviction.
	CMSketch *CMSketch
	// TopN is set to nil by dropTopN on eviction. For StatsVer >= Version2 its
	// total count is added to the histogram counts by TotalRowCount.
	TopN *TopN
	// FMSketch may be nil; MemoryUsage only accounts for it when it is set.
	FMSketch *FMSketch
	// PhysicalID is used as the TableID when this column is registered in
	// HistogramNeededItems.
	PhysicalID int64
	// NOTE(review): presumably a row count for the column — confirm against callers.
	Count int64
	// Info is the column's schema information. It is not set in some tests,
	// see the nil check in IsInvalid.
	Info *model.ColumnInfo
	// IsHandle makes AvgColSize report a fixed size of 8.
	IsHandle bool
	ErrorRate
	// NOTE(review): the meaning of the individual bits is not visible here.
	Flag int64
	// NOTE(review): presumably the position reached by the last analyze — confirm.
	LastAnalyzePos types.Datum
	StatsVer       int64 // StatsVer is the version of the current stats, used to maintain compatibility

	// StatsLoadedStatus indicates the status of column statistics
	StatsLoadedStatus
}
|
|
|
|
func (c *Column) String() string {
|
|
return c.Histogram.ToString(0)
|
|
}
|
|
|
|
// TotalRowCount returns the total count of this column.
|
|
func (c *Column) TotalRowCount() float64 {
|
|
if c.StatsVer >= Version2 {
|
|
return c.Histogram.TotalRowCount() + float64(c.TopN.TotalCount())
|
|
}
|
|
return c.Histogram.TotalRowCount()
|
|
}
|
|
|
|
func (c *Column) notNullCount() float64 {
|
|
if c.StatsVer >= Version2 {
|
|
return c.Histogram.notNullCount() + float64(c.TopN.TotalCount())
|
|
}
|
|
return c.Histogram.notNullCount()
|
|
}
|
|
|
|
// GetIncreaseFactor get the increase factor to adjust the final estimated count when the table is modified.
|
|
func (c *Column) GetIncreaseFactor(realtimeRowCount int64) float64 {
|
|
columnCount := c.TotalRowCount()
|
|
if columnCount == 0 {
|
|
// avoid dividing by 0
|
|
return 1.0
|
|
}
|
|
return float64(realtimeRowCount) / columnCount
|
|
}
|
|
|
|
// MemoryUsage returns the total memory usage of Histogram, CMSketch, FMSketch in Column.
|
|
// We ignore the size of other metadata in Column
|
|
func (c *Column) MemoryUsage() CacheItemMemoryUsage {
|
|
var sum int64
|
|
columnMemUsage := &ColumnMemUsage{
|
|
ColumnID: c.Info.ID,
|
|
}
|
|
histogramMemUsage := c.Histogram.MemoryUsage()
|
|
columnMemUsage.HistogramMemUsage = histogramMemUsage
|
|
sum = histogramMemUsage
|
|
if c.CMSketch != nil {
|
|
cmSketchMemUsage := c.CMSketch.MemoryUsage()
|
|
columnMemUsage.CMSketchMemUsage = cmSketchMemUsage
|
|
sum += cmSketchMemUsage
|
|
}
|
|
if c.TopN != nil {
|
|
topnMemUsage := c.TopN.MemoryUsage()
|
|
columnMemUsage.TopNMemUsage = topnMemUsage
|
|
sum += topnMemUsage
|
|
}
|
|
if c.FMSketch != nil {
|
|
fmSketchMemUsage := c.FMSketch.MemoryUsage()
|
|
columnMemUsage.FMSketchMemUsage = fmSketchMemUsage
|
|
sum += fmSketchMemUsage
|
|
}
|
|
columnMemUsage.TotalMemUsage = sum
|
|
return columnMemUsage
|
|
}
|
|
|
|
// HistogramNeededItems stores the columns/indices whose Histograms need to be loaded from physical kv layer.
|
|
// Currently, we only load index/pk's Histogram from kv automatically. Columns' are loaded by needs.
|
|
var HistogramNeededItems = neededStatsMap{items: map[model.TableItemID]struct{}{}}
|
|
|
|
// IsInvalid checks if this column is invalid. If this column has histogram but not loaded yet, then we mark it
|
|
// as need histogram.
|
|
func (c *Column) IsInvalid(sctx sessionctx.Context, collPseudo bool) bool {
|
|
if collPseudo && c.NotAccurate() {
|
|
return true
|
|
}
|
|
if sctx != nil {
|
|
stmtctx := sctx.GetSessionVars().StmtCtx
|
|
if c.IsLoadNeeded() && stmtctx != nil {
|
|
if stmtctx.StatsLoad.Timeout > 0 {
|
|
logutil.BgLogger().Warn("Hist for column should already be loaded as sync but not found.",
|
|
zap.String(strconv.FormatInt(c.Info.ID, 10), c.Info.Name.O))
|
|
}
|
|
// In some tests, the c.Info is not set, so we add this check here.
|
|
if c.Info != nil {
|
|
HistogramNeededItems.insert(model.TableItemID{TableID: c.PhysicalID, ID: c.Info.ID, IsIndex: false})
|
|
}
|
|
}
|
|
}
|
|
// In some cases, some statistics in column would be evicted
|
|
// For example: the cmsketch of the column might be evicted while the histogram and the topn are still exists
|
|
// In this case, we will think this column as valid due to we can still use the rest of the statistics to do optimize.
|
|
return c.TotalRowCount() == 0 || (!c.IsEssentialStatsLoaded() && c.Histogram.NDV > 0)
|
|
}
|
|
|
|
func (c *Column) equalRowCount(sctx sessionctx.Context, val types.Datum, encodedVal []byte, realtimeRowCount int64) (float64, error) {
|
|
if val.IsNull() {
|
|
return float64(c.NullCount), nil
|
|
}
|
|
if c.StatsVer < Version2 {
|
|
// All the values are null.
|
|
if c.Histogram.Bounds.NumRows() == 0 {
|
|
return 0.0, nil
|
|
}
|
|
if c.Histogram.NDV > 0 && c.outOfRange(val) {
|
|
return outOfRangeEQSelectivity(c.Histogram.NDV, realtimeRowCount, int64(c.TotalRowCount())) * c.TotalRowCount(), nil
|
|
}
|
|
if c.CMSketch != nil {
|
|
count, err := queryValue(sctx.GetSessionVars().StmtCtx, c.CMSketch, c.TopN, val)
|
|
return float64(count), errors.Trace(err)
|
|
}
|
|
histRowCount, _ := c.Histogram.equalRowCount(val, false)
|
|
return histRowCount, nil
|
|
}
|
|
|
|
// Stats version == 2
|
|
// All the values are null.
|
|
if c.Histogram.Bounds.NumRows() == 0 && c.TopN.Num() == 0 {
|
|
return 0, nil
|
|
}
|
|
// 1. try to find this value in TopN
|
|
if c.TopN != nil {
|
|
rowcount, ok := c.TopN.QueryTopN(encodedVal)
|
|
if ok {
|
|
return float64(rowcount), nil
|
|
}
|
|
}
|
|
// 2. try to find this value in bucket.Repeat(the last value in every bucket)
|
|
histCnt, matched := c.Histogram.equalRowCount(val, true)
|
|
if matched {
|
|
return histCnt, nil
|
|
}
|
|
// 3. use uniform distribution assumption for the rest (even when this value is not covered by the range of stats)
|
|
histNDV := float64(c.Histogram.NDV - int64(c.TopN.Num()))
|
|
if histNDV <= 0 {
|
|
return 0, nil
|
|
}
|
|
return c.Histogram.notNullCount() / histNDV, nil
|
|
}
|
|
|
|
// GetColumnRowCount estimates the row count by a slice of Range.
|
|
func (c *Column) GetColumnRowCount(sctx sessionctx.Context, ranges []*ranger.Range, realtimeRowCount, modifyCount int64, pkIsHandle bool) (float64, error) {
|
|
sc := sctx.GetSessionVars().StmtCtx
|
|
var rowCount float64
|
|
for _, rg := range ranges {
|
|
highVal := *rg.HighVal[0].Clone()
|
|
lowVal := *rg.LowVal[0].Clone()
|
|
if highVal.Kind() == types.KindString {
|
|
highVal.SetBytes(collate.GetCollator(highVal.Collation()).Key(highVal.GetString()))
|
|
}
|
|
if lowVal.Kind() == types.KindString {
|
|
lowVal.SetBytes(collate.GetCollator(lowVal.Collation()).Key(lowVal.GetString()))
|
|
}
|
|
cmp, err := lowVal.Compare(sc, &highVal, collate.GetBinaryCollator())
|
|
if err != nil {
|
|
return 0, errors.Trace(err)
|
|
}
|
|
lowEncoded, err := codec.EncodeKey(sc, nil, lowVal)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
highEncoded, err := codec.EncodeKey(sc, nil, highVal)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
if cmp == 0 {
|
|
// case 1: it's a point
|
|
if !rg.LowExclude && !rg.HighExclude {
|
|
// In this case, the row count is at most 1.
|
|
if pkIsHandle {
|
|
rowCount++
|
|
continue
|
|
}
|
|
var cnt float64
|
|
cnt, err = c.equalRowCount(sctx, lowVal, lowEncoded, realtimeRowCount)
|
|
if err != nil {
|
|
return 0, errors.Trace(err)
|
|
}
|
|
// If the current table row count has changed, we should scale the row count accordingly.
|
|
cnt *= c.GetIncreaseFactor(realtimeRowCount)
|
|
rowCount += cnt
|
|
}
|
|
continue
|
|
}
|
|
// In stats ver 1, we use CM Sketch to estimate row count for point condition, which is more accurate.
|
|
// So for the small range, we convert it to points.
|
|
if c.StatsVer < 2 {
|
|
rangeVals := enumRangeValues(lowVal, highVal, rg.LowExclude, rg.HighExclude)
|
|
|
|
// case 2: it's a small range && using ver1 stats
|
|
if rangeVals != nil {
|
|
for _, val := range rangeVals {
|
|
cnt, err := c.equalRowCount(sctx, val, lowEncoded, realtimeRowCount)
|
|
if err != nil {
|
|
return 0, err
|
|
}
|
|
// If the current table row count has changed, we should scale the row count accordingly.
|
|
cnt *= c.GetIncreaseFactor(realtimeRowCount)
|
|
rowCount += cnt
|
|
}
|
|
|
|
continue
|
|
}
|
|
}
|
|
|
|
// case 3: it's an interval
|
|
cnt := c.BetweenRowCount(sctx, lowVal, highVal, lowEncoded, highEncoded)
|
|
// `betweenRowCount` returns count for [l, h) range, we adjust cnt for boundaries here.
|
|
// Note that, `cnt` does not include null values, we need specially handle cases
|
|
// where null is the lower bound.
|
|
// And because we use (2, MaxValue] to represent expressions like a > 2 and use [MinNotNull, 3) to represent
|
|
// expressions like b < 3, we need to exclude the special values.
|
|
if rg.LowExclude && !lowVal.IsNull() && lowVal.Kind() != types.KindMaxValue && lowVal.Kind() != types.KindMinNotNull {
|
|
lowCnt, err := c.equalRowCount(sctx, lowVal, lowEncoded, realtimeRowCount)
|
|
if err != nil {
|
|
return 0, errors.Trace(err)
|
|
}
|
|
cnt -= lowCnt
|
|
cnt = mathutil.Clamp(cnt, 0, c.notNullCount())
|
|
}
|
|
if !rg.LowExclude && lowVal.IsNull() {
|
|
cnt += float64(c.NullCount)
|
|
}
|
|
if !rg.HighExclude && highVal.Kind() != types.KindMaxValue && highVal.Kind() != types.KindMinNotNull {
|
|
highCnt, err := c.equalRowCount(sctx, highVal, highEncoded, realtimeRowCount)
|
|
if err != nil {
|
|
return 0, errors.Trace(err)
|
|
}
|
|
cnt += highCnt
|
|
}
|
|
|
|
cnt = mathutil.Clamp(cnt, 0, c.TotalRowCount())
|
|
|
|
// If the current table row count has changed, we should scale the row count accordingly.
|
|
cnt *= c.GetIncreaseFactor(realtimeRowCount)
|
|
|
|
// handling the out-of-range part
|
|
if (c.outOfRange(lowVal) && !lowVal.IsNull()) || c.outOfRange(highVal) {
|
|
cnt += c.Histogram.outOfRangeRowCount(&lowVal, &highVal, modifyCount)
|
|
}
|
|
|
|
rowCount += cnt
|
|
}
|
|
rowCount = mathutil.Clamp(rowCount, 0, float64(realtimeRowCount))
|
|
return rowCount, nil
|
|
}
|
|
|
|
// ItemID implements TableCacheItem
|
|
func (c *Column) ItemID() int64 {
|
|
return c.Info.ID
|
|
}
|
|
|
|
// DropEvicted implements TableCacheItem
|
|
// DropEvicted drops evicted structures
|
|
func (c *Column) DropEvicted() {
|
|
if !c.statsInitialized {
|
|
return
|
|
}
|
|
switch c.evictedStatus {
|
|
case allLoaded:
|
|
if c.CMSketch != nil && c.StatsVer < Version2 {
|
|
c.dropCMS()
|
|
return
|
|
}
|
|
// For stats version2, there is no cms thus we directly drop topn
|
|
c.dropTopN()
|
|
return
|
|
case onlyCmsEvicted:
|
|
c.dropTopN()
|
|
return
|
|
default:
|
|
return
|
|
}
|
|
}
|
|
|
|
func (c *Column) dropCMS() {
|
|
c.CMSketch = nil
|
|
c.evictedStatus = onlyCmsEvicted
|
|
}
|
|
|
|
func (c *Column) dropTopN() {
|
|
originTopNNum := int64(c.TopN.Num())
|
|
c.TopN = nil
|
|
if len(c.Histogram.Buckets) == 0 && originTopNNum >= c.Histogram.NDV {
|
|
// This indicates column has topn instead of histogram
|
|
c.evictedStatus = allEvicted
|
|
} else {
|
|
c.evictedStatus = onlyHistRemained
|
|
}
|
|
}
|
|
|
|
func (c *Column) dropHist() {
|
|
c.Histogram.Bounds = chunk.NewChunkWithCapacity([]*types.FieldType{types.NewFieldType(mysql.TypeBlob)}, 0)
|
|
c.Histogram.Buckets = make([]Bucket, 0)
|
|
c.Histogram.scalars = make([]scalar, 0)
|
|
c.evictedStatus = allEvicted
|
|
}
|
|
|
|
// IsAllEvicted indicates whether all stats evicted
|
|
func (c *Column) IsAllEvicted() bool {
|
|
return c.statsInitialized && c.evictedStatus >= allEvicted
|
|
}
|
|
|
|
func (c *Column) getEvictedStatus() int {
|
|
return c.evictedStatus
|
|
}
|
|
|
|
func (c *Column) isStatsInitialized() bool {
|
|
return c.statsInitialized
|
|
}
|
|
|
|
func (c *Column) statsVer() int64 {
|
|
return c.StatsVer
|
|
}
|
|
|
|
func (c *Column) isCMSExist() bool {
|
|
return c.CMSketch != nil
|
|
}
|
|
|
|
// AvgColSize is the average column size of the histogram. These sizes are derived from function `encode`
|
|
// and `Datum::ConvertTo`, so we need to update them if those 2 functions are changed.
|
|
func (c *Column) AvgColSize(count int64, isKey bool) float64 {
|
|
if count == 0 {
|
|
return 0
|
|
}
|
|
// Note that, if the handle column is encoded as value, instead of key, i.e,
|
|
// when the handle column is in a unique index, the real column size may be
|
|
// smaller than 8 because it is encoded using `EncodeVarint`. Since we don't
|
|
// know the exact value size now, use 8 as approximation.
|
|
if c.IsHandle {
|
|
return 8
|
|
}
|
|
histCount := c.TotalRowCount()
|
|
notNullRatio := 1.0
|
|
if histCount > 0 {
|
|
notNullRatio = 1.0 - float64(c.NullCount)/histCount
|
|
}
|
|
switch c.Histogram.Tp.GetType() {
|
|
case mysql.TypeFloat, mysql.TypeDouble, mysql.TypeDuration, mysql.TypeDate, mysql.TypeDatetime, mysql.TypeTimestamp:
|
|
return 8 * notNullRatio
|
|
case mysql.TypeTiny, mysql.TypeShort, mysql.TypeInt24, mysql.TypeLong, mysql.TypeLonglong, mysql.TypeYear, mysql.TypeEnum, mysql.TypeBit, mysql.TypeSet:
|
|
if isKey {
|
|
return 8 * notNullRatio
|
|
}
|
|
}
|
|
// Keep two decimal place.
|
|
return math.Round(float64(c.TotColSize)/float64(count)*100) / 100
|
|
}
|
|
|
|
// AvgColSizeChunkFormat is the average column size of the histogram. These sizes are derived from function `Encode`
|
|
// and `DecodeToChunk`, so we need to update them if those 2 functions are changed.
|
|
func (c *Column) AvgColSizeChunkFormat(count int64) float64 {
|
|
if count == 0 {
|
|
return 0
|
|
}
|
|
fixedLen := chunk.GetFixedLen(c.Histogram.Tp)
|
|
if fixedLen != -1 {
|
|
return float64(fixedLen)
|
|
}
|
|
// Keep two decimal place.
|
|
// Add 8 bytes for unfixed-len type's offsets.
|
|
// Minus Log2(avgSize) for unfixed-len type LEN.
|
|
avgSize := float64(c.TotColSize) / float64(count)
|
|
if avgSize < 1 {
|
|
return math.Round(avgSize*100)/100 + 8
|
|
}
|
|
return math.Round((avgSize-math.Log2(avgSize))*100)/100 + 8
|
|
}
|
|
|
|
// AvgColSizeListInDisk is the average column size of the histogram. These sizes are derived
|
|
// from `chunk.ListInDisk` so we need to update them if those 2 functions are changed.
|
|
func (c *Column) AvgColSizeListInDisk(count int64) float64 {
|
|
if count == 0 {
|
|
return 0
|
|
}
|
|
histCount := c.TotalRowCount()
|
|
notNullRatio := 1.0
|
|
if histCount > 0 {
|
|
notNullRatio = 1.0 - float64(c.NullCount)/histCount
|
|
}
|
|
size := chunk.GetFixedLen(c.Histogram.Tp)
|
|
if size != -1 {
|
|
return float64(size) * notNullRatio
|
|
}
|
|
// Keep two decimal place.
|
|
// Minus Log2(avgSize) for unfixed-len type LEN.
|
|
avgSize := float64(c.TotColSize) / float64(count)
|
|
if avgSize < 1 {
|
|
return math.Round((avgSize)*100) / 100
|
|
}
|
|
return math.Round((avgSize-math.Log2(avgSize))*100) / 100
|
|
}
|
|
|
|
// BetweenRowCount estimates the row count for interval [l, r).
|
|
func (c *Column) BetweenRowCount(sctx sessionctx.Context, l, r types.Datum, lowEncoded, highEncoded []byte) float64 {
|
|
histBetweenCnt := c.Histogram.BetweenRowCount(l, r)
|
|
if c.StatsVer <= Version1 {
|
|
return histBetweenCnt
|
|
}
|
|
return float64(c.TopN.BetweenCount(lowEncoded, highEncoded)) + histBetweenCnt
|
|
}
|
|
|
|
// StatusToString gets the string info of StatsLoadedStatus
|
|
func (s StatsLoadedStatus) StatusToString() string {
|
|
if !s.statsInitialized {
|
|
return "unInitialized"
|
|
}
|
|
switch s.evictedStatus {
|
|
case allLoaded:
|
|
return "allLoaded"
|
|
case onlyCmsEvicted:
|
|
return "onlyCmsEvicted"
|
|
case onlyHistRemained:
|
|
return "onlyHistRemained"
|
|
case allEvicted:
|
|
return "allEvicted"
|
|
}
|
|
return "unknown"
|
|
}
|