// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
	"cmp"
	"fmt"
	"slices"
	"strings"
	"sync"

	"github.com/pingcap/tidb/expression"
	"github.com/pingcap/tidb/parser/model"
	"github.com/pingcap/tidb/parser/mysql"
	"github.com/pingcap/tidb/sessionctx"
	"github.com/pingcap/tidb/types"
	"github.com/pingcap/tidb/util/ranger"
	"go.uber.org/atomic"
	"golang.org/x/exp/maps"
)

const (
	// PseudoVersion means the pseudo statistics version is 0.
	PseudoVersion uint64 = 0

	// PseudoRowCount is exported for other packages to use.
	// When we haven't analyzed a table, we use pseudo statistics to estimate costs.
	// It has row count 10000, equal condition selects 1/1000 of total rows, less condition selects 1/3 of total rows,
	// between condition selects 1/40 of total rows.
	PseudoRowCount = 10000
)

var (
	// The function variables below are used to solve a cyclic import problem.
	// They are declared here without a value; this declaration leaves them nil.
	// Note: all functions below will be removed after finishing moving all estimation functions into the cardinality package.

	// GetRowCountByIndexRanges is a function variable used to get the row count by index ranges.
	GetRowCountByIndexRanges func(sctx sessionctx.Context, coll *HistColl, idxID int64, indexRanges []*ranger.Range) (result float64, err error)

	// GetRowCountByIntColumnRanges is a function variable used to get the row count by int column ranges.
	GetRowCountByIntColumnRanges func(sctx sessionctx.Context, coll *HistColl, colID int64, intRanges []*ranger.Range) (result float64, err error)

	// GetRowCountByColumnRanges is a function variable used to get the row count by column ranges.
	GetRowCountByColumnRanges func(sctx sessionctx.Context, coll *HistColl, colID int64, colRanges []*ranger.Range) (result float64, err error)
)

// Table represents statistics for a table.
// HistColl is embedded, so its fields (Columns, Indices, RealtimeCount, ...) are
// promoted and can be accessed directly on a Table.
type Table struct {
	// ExtendedStats may be nil; Copy checks for nil before cloning it.
	ExtendedStats *ExtendedStatsColl
	Name          string
	HistColl
	Version uint64
	// TblInfoUpdateTS is the UpdateTS of the TableInfo used when filling this struct.
	// It is the schema version of the corresponding table. It is used to skip redundant
	// loading of stats, i.e., if the cached stats are already up-to-date with the mysql.stats_xxx tables,
	// and the schema of the table does not change, we don't need to load the stats for this
	// table again.
	TblInfoUpdateTS uint64
}

// ExtendedStatsItem is the cached item of a mysql.stats_extended record.
type ExtendedStatsItem struct {
	StringVals string
	ColIDs     []int64
	ScalarVals float64
	Tp         uint8
}

// ExtendedStatsColl is a collection of cached items for mysql.stats_extended records.
type ExtendedStatsColl struct {
	// Stats holds the cached items keyed by a string.
	// NOTE(review): the key is presumably the extended-stats name — confirm against the loader.
	Stats             map[string]*ExtendedStatsItem
	LastUpdateVersion uint64
}

// NewExtendedStatsColl allocates an ExtendedStatsColl with an empty, non-nil Stats map.
// LastUpdateVersion is left at its zero value.
func NewExtendedStatsColl() *ExtendedStatsColl {
	return &ExtendedStatsColl{Stats: make(map[string]*ExtendedStatsItem)}
}

const (
	// ExtendedStatsInited is the status for extended stats which are just registered but have not been analyzed yet.
	ExtendedStatsInited uint8 = iota
	// ExtendedStatsAnalyzed is the status for extended stats which have been collected in analyze.
	ExtendedStatsAnalyzed
	// ExtendedStatsDeleted is the status for extended stats which were dropped. These "deleted" records would be removed from storage by GCStats().
	ExtendedStatsDeleted
)

// HistColl is a collection of histogram. It collects enough information for plan to calculate the selectivity.
type HistColl struct { Columns map[int64]*Column Indices map[int64]*Index // Idx2ColumnIDs maps the index id to its column ids. It's used to calculate the selectivity in planner. Idx2ColumnIDs map[int64][]int64 // ColID2IdxIDs maps the column id to a list index ids whose first column is it. It's used to calculate the selectivity in planner. ColID2IdxIDs map[int64][]int64 PhysicalID int64 // TODO: add AnalyzeCount here RealtimeCount int64 // RealtimeCount is the current table row count, maintained by applying stats delta based on AnalyzeCount. ModifyCount int64 // Total modify count in a table. // HavePhysicalID is true means this HistColl is from single table and have its ID's information. // The physical id is used when try to load column stats from storage. HavePhysicalID bool Pseudo bool } // TableMemoryUsage records tbl memory usage type TableMemoryUsage struct { ColumnsMemUsage map[int64]CacheItemMemoryUsage IndicesMemUsage map[int64]CacheItemMemoryUsage TableID int64 TotalMemUsage int64 } // TotalIdxTrackingMemUsage returns total indices' tracking memory usage func (t *TableMemoryUsage) TotalIdxTrackingMemUsage() (sum int64) { for _, idx := range t.IndicesMemUsage { sum += idx.TrackingMemUsage() } return sum } // TotalColTrackingMemUsage returns total columns' tracking memory usage func (t *TableMemoryUsage) TotalColTrackingMemUsage() (sum int64) { for _, col := range t.ColumnsMemUsage { sum += col.TrackingMemUsage() } return sum } // TotalTrackingMemUsage return total tracking memory usage func (t *TableMemoryUsage) TotalTrackingMemUsage() int64 { return t.TotalIdxTrackingMemUsage() + t.TotalColTrackingMemUsage() } // TableCacheItem indicates the unit item stored in statsCache, eg: Column/Index type TableCacheItem interface { ItemID() int64 MemoryUsage() CacheItemMemoryUsage IsAllEvicted() bool GetEvictedStatus() int DropUnnecessaryData() IsStatsInitialized() bool GetStatsVer() int64 } // CacheItemMemoryUsage indicates the memory usage of TableCacheItem type 
CacheItemMemoryUsage interface { ItemID() int64 TotalMemoryUsage() int64 TrackingMemUsage() int64 HistMemUsage() int64 TopnMemUsage() int64 CMSMemUsage() int64 } // ColumnMemUsage records column memory usage type ColumnMemUsage struct { ColumnID int64 HistogramMemUsage int64 CMSketchMemUsage int64 FMSketchMemUsage int64 TopNMemUsage int64 TotalMemUsage int64 } // TotalMemoryUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) TotalMemoryUsage() int64 { return c.TotalMemUsage } // ItemID implements CacheItemMemoryUsage func (c *ColumnMemUsage) ItemID() int64 { return c.ColumnID } // TrackingMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) TrackingMemUsage() int64 { return c.CMSketchMemUsage + c.TopNMemUsage + c.HistogramMemUsage } // HistMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) HistMemUsage() int64 { return c.HistogramMemUsage } // TopnMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) TopnMemUsage() int64 { return c.TopNMemUsage } // CMSMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) CMSMemUsage() int64 { return c.CMSketchMemUsage } // IndexMemUsage records index memory usage type IndexMemUsage struct { IndexID int64 HistogramMemUsage int64 CMSketchMemUsage int64 TopNMemUsage int64 TotalMemUsage int64 } // TotalMemoryUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) TotalMemoryUsage() int64 { return c.TotalMemUsage } // ItemID implements CacheItemMemoryUsage func (c *IndexMemUsage) ItemID() int64 { return c.IndexID } // TrackingMemUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) TrackingMemUsage() int64 { return c.CMSketchMemUsage + c.TopNMemUsage + c.HistogramMemUsage } // HistMemUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) HistMemUsage() int64 { return c.HistogramMemUsage } // TopnMemUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) TopnMemUsage() int64 { return c.TopNMemUsage } // CMSMemUsage implements CacheItemMemoryUsage 
func (c *IndexMemUsage) CMSMemUsage() int64 {
	return c.CMSketchMemUsage
}

// MemoryUsage builds a TableMemoryUsage for this Table.
// Only the stats data of non-nil Columns and Indices is accounted for; other
// metadata held by Table is ignored. Each entry is keyed by the ItemID that the
// item's own memory-usage record reports, and TotalMemUsage is the sum of the
// entries' TotalMemoryUsage.
func (t *Table) MemoryUsage() *TableMemoryUsage {
	usage := &TableMemoryUsage{
		TableID:         t.PhysicalID,
		ColumnsMemUsage: make(map[int64]CacheItemMemoryUsage),
		IndicesMemUsage: make(map[int64]CacheItemMemoryUsage),
	}
	for _, col := range t.Columns {
		if col == nil {
			continue
		}
		mem := col.MemoryUsage()
		usage.ColumnsMemUsage[mem.ItemID()] = mem
		usage.TotalMemUsage += mem.TotalMemoryUsage()
	}
	for _, idx := range t.Indices {
		if idx == nil {
			continue
		}
		mem := idx.MemoryUsage()
		usage.IndicesMemUsage[mem.ItemID()] = mem
		usage.TotalMemUsage += mem.TotalMemoryUsage()
	}
	return usage
}

// Copy returns a new Table whose Columns and Indices maps are freshly
// allocated and filled with the result of calling Copy on every entry.
//
// Idx2ColumnIDs and ColID2IdxIDs are not carried over to the result. When
// ExtendedStats is non-nil a new ExtendedStatsColl with a new map is created,
// but the *ExtendedStatsItem values themselves are shared with the receiver.
func (t *Table) Copy() *Table {
	nt := &Table{
		HistColl: HistColl{
			PhysicalID:     t.PhysicalID,
			HavePhysicalID: t.HavePhysicalID,
			RealtimeCount:  t.RealtimeCount,
			Columns:        make(map[int64]*Column, len(t.Columns)),
			Indices:        make(map[int64]*Index, len(t.Indices)),
			Pseudo:         t.Pseudo,
			ModifyCount:    t.ModifyCount,
		},
		Version:         t.Version,
		Name:            t.Name,
		TblInfoUpdateTS: t.TblInfoUpdateTS,
	}
	for colID, col := range t.Columns {
		nt.Columns[colID] = col.Copy()
	}
	for idxID, idx := range t.Indices {
		nt.Indices[idxID] = idx.Copy()
	}
	if t.ExtendedStats == nil {
		return nt
	}

	extStats := &ExtendedStatsColl{
		Stats:             make(map[string]*ExtendedStatsItem),
		LastUpdateVersion: t.ExtendedStats.LastUpdateVersion,
	}
	for statsName, statsItem := range t.ExtendedStats.Stats {
		extStats.Stats[statsName] = statsItem
	}
	nt.ExtendedStats = extStats
	return nt
}

// ShallowCopy copies the current table.
// It's different from Copy(). Only the struct Table (and also the embedded HistColl) is copied here.
// The internal containers, like t.Columns and t.Indices, and the stats, like TopN and Histogram are not copied. func (t *Table) ShallowCopy() *Table { newHistColl := HistColl{ PhysicalID: t.PhysicalID, HavePhysicalID: t.HavePhysicalID, RealtimeCount: t.RealtimeCount, Columns: t.Columns, Indices: t.Indices, Pseudo: t.Pseudo, ModifyCount: t.ModifyCount, } nt := &Table{ HistColl: newHistColl, Version: t.Version, Name: t.Name, TblInfoUpdateTS: t.TblInfoUpdateTS, ExtendedStats: t.ExtendedStats, } return nt } // String implements Stringer interface. func (t *Table) String() string { strs := make([]string, 0, len(t.Columns)+1) strs = append(strs, fmt.Sprintf("Table:%d RealtimeCount:%d", t.PhysicalID, t.RealtimeCount)) cols := make([]*Column, 0, len(t.Columns)) for _, col := range t.Columns { cols = append(cols, col) } slices.SortFunc(cols, func(i, j *Column) int { return cmp.Compare(i.ID, j.ID) }) for _, col := range cols { strs = append(strs, col.String()) } idxs := make([]*Index, 0, len(t.Indices)) for _, idx := range t.Indices { idxs = append(idxs, idx) } slices.SortFunc(idxs, func(i, j *Index) int { return cmp.Compare(i.ID, j.ID) }) for _, idx := range idxs { strs = append(strs, idx.String()) } // TODO: concat content of ExtendedStatsColl return strings.Join(strs, "\n") } // IndexStartWithColumn finds the first index whose first column is the given column. func (t *Table) IndexStartWithColumn(colName string) *Index { for _, index := range t.Indices { if index.Info.Columns[0].Name.L == colName { return index } } return nil } // ColumnByName finds the statistics.Column for the given column. func (t *Table) ColumnByName(colName string) *Column { for _, c := range t.Columns { if c.Info.Name.L == colName { return c } } return nil } // GetStatsInfo returns their statistics according to the ID of the column or index, including histogram, CMSketch, TopN and FMSketch. 
func (t *Table) GetStatsInfo(id int64, isIndex bool) (*Histogram, *CMSketch, *TopN, *FMSketch, bool) { if isIndex { if idxStatsInfo, ok := t.Indices[id]; ok { return idxStatsInfo.Histogram.Copy(), idxStatsInfo.CMSketch.Copy(), idxStatsInfo.TopN.Copy(), idxStatsInfo.FMSketch.Copy(), true } // newly added index which is not analyzed yet return nil, nil, nil, nil, false } if colStatsInfo, ok := t.Columns[id]; ok { return colStatsInfo.Histogram.Copy(), colStatsInfo.CMSketch.Copy(), colStatsInfo.TopN.Copy(), colStatsInfo.FMSketch.Copy(), true } // newly added column which is not analyzed yet return nil, nil, nil, nil, false } // GetAnalyzeRowCount tries to get the row count of a column or an index if possible. // This method is useful because this row count doesn't consider the modify count. func (coll *HistColl) GetAnalyzeRowCount() float64 { ids := maps.Keys(coll.Columns) slices.Sort(ids) for _, id := range ids { col := coll.Columns[id] if col != nil && col.IsFullLoad() { return col.TotalRowCount() } } ids = maps.Keys(coll.Indices) slices.Sort(ids) for _, id := range ids { idx := coll.Indices[id] if idx != nil && idx.IsFullLoad() { return idx.TotalRowCount() } } return -1 } // GetStatsHealthy calculates stats healthy if the table stats is not pseudo. // If the table stats is pseudo, it returns 0, false, otherwise it returns stats healthy, true. 
func (t *Table) GetStatsHealthy() (int64, bool) { if t == nil || t.Pseudo { return 0, false } var healthy int64 count := float64(t.RealtimeCount) if histCount := t.GetAnalyzeRowCount(); histCount > 0 { count = histCount } if float64(t.ModifyCount) < count { healthy = int64((1.0 - float64(t.ModifyCount)/count) * 100.0) } else if t.ModifyCount == 0 { healthy = 100 } return healthy, true } type neededStatsMap struct { items map[model.TableItemID]struct{} m sync.RWMutex } func (n *neededStatsMap) AllItems() []model.TableItemID { n.m.RLock() keys := make([]model.TableItemID, 0, len(n.items)) for key := range n.items { keys = append(keys, key) } n.m.RUnlock() return keys } func (n *neededStatsMap) insert(col model.TableItemID) { n.m.Lock() n.items[col] = struct{}{} n.m.Unlock() } func (n *neededStatsMap) Delete(col model.TableItemID) { n.m.Lock() delete(n.items, col) n.m.Unlock() } func (n *neededStatsMap) Length() int { n.m.RLock() defer n.m.RUnlock() return len(n.items) } // RatioOfPseudoEstimate means if modifyCount / statsTblCount is greater than this ratio, we think the stats is invalid // and use pseudo estimation. var RatioOfPseudoEstimate = atomic.NewFloat64(0.7) // IsInitialized returns true if any column/index stats of the table is initialized. func (t *Table) IsInitialized() bool { for _, col := range t.Columns { if col != nil && col.IsStatsInitialized() { return true } } for _, idx := range t.Indices { if idx != nil && idx.IsStatsInitialized() { return true } } return false } // IsOutdated returns true if the table stats is outdated. func (t *Table) IsOutdated() bool { rowcount := t.GetAnalyzeRowCount() if rowcount < 0 { rowcount = float64(t.RealtimeCount) } if rowcount > 0 && float64(t.ModifyCount)/rowcount > RatioOfPseudoEstimate.Load() { return true } return false } // ID2UniqueID generates a new HistColl whose `Columns` is built from UniqueID of given columns. 
func (coll *HistColl) ID2UniqueID(columns []*expression.Column) *HistColl { cols := make(map[int64]*Column) for _, col := range columns { colHist, ok := coll.Columns[col.ID] if ok { cols[col.UniqueID] = colHist } } newColl := &HistColl{ PhysicalID: coll.PhysicalID, HavePhysicalID: coll.HavePhysicalID, Pseudo: coll.Pseudo, RealtimeCount: coll.RealtimeCount, ModifyCount: coll.ModifyCount, Columns: cols, } return newColl } // GenerateHistCollFromColumnInfo generates a new HistColl whose ColID2IdxIDs and IdxID2ColIDs is built from the given parameter. func (coll *HistColl) GenerateHistCollFromColumnInfo(tblInfo *model.TableInfo, columns []*expression.Column) *HistColl { newColHistMap := make(map[int64]*Column) colInfoID2UniqueID := make(map[int64]int64, len(columns)) idxID2idxInfo := make(map[int64]*model.IndexInfo) for _, col := range columns { colInfoID2UniqueID[col.ID] = col.UniqueID } for id, colHist := range coll.Columns { uniqueID, ok := colInfoID2UniqueID[id] // Collect the statistics by the given columns. if ok { newColHistMap[uniqueID] = colHist } } for _, idxInfo := range tblInfo.Indices { idxID2idxInfo[idxInfo.ID] = idxInfo } newIdxHistMap := make(map[int64]*Index) idx2Columns := make(map[int64][]int64) colID2IdxIDs := make(map[int64][]int64) for id, idxHist := range coll.Indices { idxInfo := idxID2idxInfo[id] if idxInfo == nil { continue } ids := make([]int64, 0, len(idxInfo.Columns)) for _, idxCol := range idxInfo.Columns { uniqueID, ok := colInfoID2UniqueID[tblInfo.Columns[idxCol.Offset].ID] if !ok { break } ids = append(ids, uniqueID) } // If the length of the id list is 0, this index won't be used in this query. 
if len(ids) == 0 { continue } colID2IdxIDs[ids[0]] = append(colID2IdxIDs[ids[0]], idxHist.ID) newIdxHistMap[idxHist.ID] = idxHist idx2Columns[idxHist.ID] = ids } for _, idxIDs := range colID2IdxIDs { slices.Sort(idxIDs) } newColl := &HistColl{ PhysicalID: coll.PhysicalID, HavePhysicalID: coll.HavePhysicalID, Pseudo: coll.Pseudo, RealtimeCount: coll.RealtimeCount, ModifyCount: coll.ModifyCount, Columns: newColHistMap, Indices: newIdxHistMap, ColID2IdxIDs: colID2IdxIDs, Idx2ColumnIDs: idx2Columns, } return newColl } // PseudoTable creates a pseudo table statistics. // Usually, we don't want to trigger stats loading for pseudo table. // But there are exceptional cases. In such cases, we should pass allowTriggerLoading as true. // Such case could possibly happen in getStatsTable(). func PseudoTable(tblInfo *model.TableInfo, allowTriggerLoading bool) *Table { const fakePhysicalID int64 = -1 pseudoHistColl := HistColl{ RealtimeCount: PseudoRowCount, PhysicalID: tblInfo.ID, HavePhysicalID: true, Columns: make(map[int64]*Column, len(tblInfo.Columns)), Indices: make(map[int64]*Index, len(tblInfo.Indices)), Pseudo: true, } t := &Table{ HistColl: pseudoHistColl, } for _, col := range tblInfo.Columns { // The column is public to use. Also we should check the column is not hidden since hidden means that it's used by expression index. // We would not collect stats for the hidden column and we won't use the hidden column to estimate. // Thus we don't create pseudo stats for it. 
if col.State == model.StatePublic && !col.Hidden { t.Columns[col.ID] = &Column{ PhysicalID: fakePhysicalID, Info: col, IsHandle: tblInfo.PKIsHandle && mysql.HasPriKeyFlag(col.GetFlag()), Histogram: *NewHistogram(col.ID, 0, 0, 0, &col.FieldType, 0, 0), } if allowTriggerLoading { t.Columns[col.ID].PhysicalID = tblInfo.ID } } } for _, idx := range tblInfo.Indices { if idx.State == model.StatePublic { t.Indices[idx.ID] = &Index{ PhysicalID: fakePhysicalID, Info: idx, Histogram: *NewHistogram(idx.ID, 0, 0, 0, types.NewFieldType(mysql.TypeBlob), 0, 0), } if allowTriggerLoading { t.Indices[idx.ID].PhysicalID = tblInfo.ID } } } return t } // CheckAnalyzeVerOnTable checks whether the given version is the one from the tbl. // If not, it will return false and set the version to the tbl's. // We use this check to make sure all the statistics of the table are in the same version. func CheckAnalyzeVerOnTable(tbl *Table, version *int) bool { for _, col := range tbl.Columns { if !col.IsAnalyzed() { continue } if col.StatsVer != int64(*version) { *version = int(col.StatsVer) return false } // If we found one column and the version is the same, we can directly return since all the versions from this table is the same. return true } for _, idx := range tbl.Indices { if !idx.IsAnalyzed() { continue } if idx.StatsVer != int64(*version) { *version = int(idx.StatsVer) return false } // If we found one column and the version is the same, we can directly return since all the versions from this table is the same. return true } // This table has no statistics yet. We can directly return true. return true }