// Copyright 2017 PingCAP, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package statistics import ( "cmp" "fmt" "maps" "slices" "strings" "github.com/pingcap/tidb/pkg/expression" "github.com/pingcap/tidb/pkg/meta/model" "github.com/pingcap/tidb/pkg/parser/mysql" "github.com/pingcap/tidb/pkg/planner/planctx" "github.com/pingcap/tidb/pkg/types" "github.com/pingcap/tidb/pkg/util/ranger" "go.uber.org/atomic" ) const ( // PseudoVersion means the pseudo statistics version is 0. PseudoVersion uint64 = 0 // PseudoRowCount export for other pkg to use. // When we haven't analyzed a table, we use pseudo statistics to estimate costs. // It has row count 10000, equal condition selects 1/1000 of total rows, less condition selects 1/3 of total rows, // between condition selects 1/40 of total rows. PseudoRowCount = 10000 ) // CopyIntent specifies what data structures are safe to modify in the copied table. type CopyIntent uint8 const ( // MetaOnly shares all maps - only table metadata is safe to modify MetaOnly CopyIntent = iota // ColumnMapWritable clones columns map - safe to add/remove columns ColumnMapWritable // IndexMapWritable clones indices map - safe to add/remove indices IndexMapWritable // BothMapsWritable clones both maps - safe to add/remove columns and indices BothMapsWritable // ExtendedStatsWritable shares all maps - safe to modify ExtendedStats field ExtendedStatsWritable // AllDataWritable deep copies everything - safe to modify all data including histograms AllDataWritable ) // AutoAnalyzeMinCnt means if the count of table is less than this value, we don't need to do auto analyze. // Exported for testing. var AutoAnalyzeMinCnt int64 = 1000 var ( // Below functions are used to solve cycle import problem. // Note: all functions below will be removed after finishing moving all estimation functions into the cardinality package. // GetRowCountByIndexRanges is a function type to get row count by index ranges. GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range, idxCol []*expression.Column) (result RowEstimate, err error) // GetRowCountByColumnRanges is a function type to get row count by column ranges. GetRowCountByColumnRanges func(sctx planctx.PlanContext, coll *HistColl, colID int64, colRanges []*ranger.Range, pkIsHandle bool) (result RowEstimate, err error) ) // Table represents statistics for a table. type Table struct { ExtendedStats *ExtendedStatsColl ColAndIdxExistenceMap *ColAndIdxExistenceMap HistColl Version uint64 // It's the timestamp of the last analyze time. // We used it in auto-analyze to determine if this table has been analyzed. // The source of this field comes from two parts: // 1. Initialized by snapshot when loading stats_meta. // 2. Updated by the analysis time of a specific column or index when loading the histogram of the column or index. LastAnalyzeVersion uint64 // LastStatsHistVersion is the mvcc version of the last update of histograms. // It differs from LastAnalyzeVersion because it can be influenced by some DDL. // e.g. When we execute ALTER TABLE ADD COLUMN, there'll be new record inserted into mysql.stats_histograms. // We need to load the corresponding one into memory too. // It's used to skip redundant loading of stats, i.e, if the cached stats is already update-to-date with mysql.stats_xxx tables, // and the schema of the table does not change, we don't need to load the stats for this table again. // Stats' sync load/async load should not change this field since they are not table-level update. // It's hard to deal with the upgrade compatibility of this field, the field will not take effect unless // auto analyze or DDL happened on the table. LastStatsHistVersion uint64 // TblInfoUpdateTS is the UpdateTS of the TableInfo used when filling this struct. // It is the schema version of the corresponding table. It is used to skip redundant // loading of stats, i.e, if the cached stats is already update-to-date with mysql.stats_xxx tables, // and the schema of the table does not change, we don't need to load the stats for this // table again. // TODO: it can be removed now that we've have LastAnalyseVersion and LastStatsHistVersion. TblInfoUpdateTS uint64 IsPkIsHandle bool } // ColAndIdxExistenceMap is the meta map for statistics.Table. // It can tell whether a column/index really has its statistics. So we won't send useless kv request when we do online stats loading. type ColAndIdxExistenceMap struct { colAnalyzed map[int64]bool idxAnalyzed map[int64]bool } // DeleteColNotFound deletes the column with the given id. func (m *ColAndIdxExistenceMap) DeleteColNotFound(id int64) { delete(m.colAnalyzed, id) } // DeleteIdxNotFound deletes the index with the given id. func (m *ColAndIdxExistenceMap) DeleteIdxNotFound(id int64) { delete(m.idxAnalyzed, id) } // HasAnalyzed checks whether a column/index stats exists and it has stats. // TODO: the map should only keep the analyzed cols. // There's three possible status of column/index's statistics: // 1. We don't have this column/index. // 2. We have it, but it hasn't been analyzed yet. // 3. We have it and its statistics. // // To figure out three status, we use HasAnalyzed's TRUE value to represents the status 3. The Has's FALSE to represents the status 1. // Begin from v8.5.2, the 1. case becomes a nearly invalid case. It's just a middle state between happening of the DDL and the completion of the stats' ddl handler. // But we may need to deal with the 1. for the upgrade compatibility. func (m *ColAndIdxExistenceMap) HasAnalyzed(id int64, isIndex bool) bool { if isIndex { analyzed, ok := m.idxAnalyzed[id] return ok && analyzed } analyzed, ok := m.colAnalyzed[id] return ok && analyzed } // Has checks whether a column/index stats exists. func (m *ColAndIdxExistenceMap) Has(id int64, isIndex bool) bool { if isIndex { _, ok := m.idxAnalyzed[id] return ok } _, ok := m.colAnalyzed[id] return ok } // InsertCol inserts a column with its meta into the map. func (m *ColAndIdxExistenceMap) InsertCol(id int64, analyzed bool) { m.colAnalyzed[id] = analyzed } // InsertIndex inserts an index with its meta into the map. func (m *ColAndIdxExistenceMap) InsertIndex(id int64, analyzed bool) { m.idxAnalyzed[id] = analyzed } // IsEmpty checks whether the map is empty. func (m *ColAndIdxExistenceMap) IsEmpty() bool { return len(m.colAnalyzed)+len(m.idxAnalyzed) == 0 } // ColNum returns the number of columns in the map. func (m *ColAndIdxExistenceMap) ColNum() int { return len(m.colAnalyzed) } // Clone deeply copies the map. func (m *ColAndIdxExistenceMap) Clone() *ColAndIdxExistenceMap { mm := NewColAndIndexExistenceMap(len(m.colAnalyzed), len(m.idxAnalyzed)) mm.colAnalyzed = maps.Clone(m.colAnalyzed) mm.idxAnalyzed = maps.Clone(m.idxAnalyzed) return mm } const ( defaultColCap = 16 defaultIdxCap = 4 ) // NewColAndIndexExistenceMapWithoutSize return a new object with default capacity. func NewColAndIndexExistenceMapWithoutSize() *ColAndIdxExistenceMap { return &ColAndIdxExistenceMap{ colAnalyzed: make(map[int64]bool, defaultColCap), idxAnalyzed: make(map[int64]bool, defaultIdxCap), } } // NewColAndIndexExistenceMap return a new object with the given capcity. func NewColAndIndexExistenceMap(colCap, idxCap int) *ColAndIdxExistenceMap { return &ColAndIdxExistenceMap{ colAnalyzed: make(map[int64]bool, colCap), idxAnalyzed: make(map[int64]bool, idxCap), } } // ColAndIdxExistenceMapIsEqual is used in testing, checking whether the two are equal. func ColAndIdxExistenceMapIsEqual(m1, m2 *ColAndIdxExistenceMap) bool { return maps.Equal(m1.colAnalyzed, m2.colAnalyzed) && maps.Equal(m1.idxAnalyzed, m2.idxAnalyzed) } // ExtendedStatsItem is the cached item of a mysql.stats_extended record. type ExtendedStatsItem struct { StringVals string ColIDs []int64 ScalarVals float64 Tp uint8 } // ExtendedStatsColl is a collection of cached items for mysql.stats_extended records. type ExtendedStatsColl struct { Stats map[string]*ExtendedStatsItem LastUpdateVersion uint64 } // NewExtendedStatsColl allocate an ExtendedStatsColl struct. func NewExtendedStatsColl() *ExtendedStatsColl { return &ExtendedStatsColl{Stats: make(map[string]*ExtendedStatsItem)} } const ( // ExtendedStatsInited is the status for extended stats which are just registered but have not been analyzed yet. ExtendedStatsInited uint8 = iota // ExtendedStatsAnalyzed is the status for extended stats which have been collected in analyze. ExtendedStatsAnalyzed // ExtendedStatsDeleted is the status for extended stats which were dropped. These "deleted" records would be removed from storage by GCStats(). ExtendedStatsDeleted ) // HistColl is a collection of histograms. It collects enough information for plan to calculate the selectivity. type HistColl struct { // Note that when used in a query, Column use UniqueID as the key while Indices use the index ID in the // metadata. (See GenerateHistCollFromColumnInfo() for details) columns map[int64]*Column indices map[int64]*Index PhysicalID int64 // TODO: add AnalyzeCount here RealtimeCount int64 // RealtimeCount is the current table row count, maintained by applying stats delta based on AnalyzeCount. ModifyCount int64 // Total modify count in a table. // The version of the statistics, refer to Version0, Version1, Version2 and so on. StatsVer int Pseudo bool /* Fields below are only used in a query, like for estimation, and they will be useless when stored in the stats cache. (See GenerateHistCollFromColumnInfo() for details) */ CanNotTriggerLoad bool // Idx2ColUniqueIDs maps the index id to its column UniqueIDs. It's used to calculate the selectivity in planner. Idx2ColUniqueIDs map[int64][]int64 // ColUniqueID2IdxIDs maps the column UniqueID to a list index ids whose first column is it. // It's used to calculate the selectivity in planner. ColUniqueID2IdxIDs map[int64][]int64 // UniqueID2colInfoID maps the column UniqueID to its ID in the metadata. UniqueID2colInfoID map[int64]int64 // MVIdx2Columns maps the index id to its columns by expression.Column. // For normal index, the column id is enough, as we already have in Idx2ColUniqueIDs. But currently, mv index needs more // information to match the filter against the mv index columns, and we need this map to provide this information. MVIdx2Columns map[int64][]*expression.Column } // NewHistColl creates a new HistColl. func NewHistColl(id int64, realtimeCnt, modifyCnt int64, colNum, idxNum int) *HistColl { return &HistColl{ columns: make(map[int64]*Column, colNum), indices: make(map[int64]*Index, idxNum), PhysicalID: id, RealtimeCount: realtimeCnt, ModifyCount: modifyCnt, Idx2ColUniqueIDs: make(map[int64][]int64), ColUniqueID2IdxIDs: make(map[int64][]int64), UniqueID2colInfoID: make(map[int64]int64), MVIdx2Columns: make(map[int64][]*expression.Column), } } // NewHistCollWithColsAndIdxs creates a new HistColl with given columns and indices. func NewHistCollWithColsAndIdxs(id int64, realtimeCnt, modifyCnt int64, cols map[int64]*Column, idxs map[int64]*Index) *HistColl { return &HistColl{ columns: cols, indices: idxs, PhysicalID: id, RealtimeCount: realtimeCnt, ModifyCount: modifyCnt, Idx2ColUniqueIDs: make(map[int64][]int64), ColUniqueID2IdxIDs: make(map[int64][]int64), UniqueID2colInfoID: make(map[int64]int64), MVIdx2Columns: make(map[int64][]*expression.Column), } } // SetCol sets the column with the given id. func (coll *HistColl) SetCol(id int64, col *Column) { coll.columns[id] = col } // SetIdx sets the index with the given id. func (coll *HistColl) SetIdx(id int64, idx *Index) { coll.indices[id] = idx } // GetCol gets the column with the given id. func (coll *HistColl) GetCol(id int64) *Column { return coll.columns[id] } // GetIdx gets the index with the given id. func (coll *HistColl) GetIdx(id int64) *Index { return coll.indices[id] } // ForEachColumnImmutable iterates all columns in the HistColl. // The bool return value of f is used to control the iteration. If f returns true, the iteration will be stopped. // Warning: Don't change the content when calling this function. func (coll *HistColl) ForEachColumnImmutable(f func(int64, *Column) bool) { for id, col := range coll.columns { if f(id, col) { return } } } // ForEachIndexImmutable iterates all columns in the HistColl. // The bool return value of f is used to control the iteration. If f returns true, the iteration will be stopped. // WARNING: Don't change the content when calling this function. func (coll *HistColl) ForEachIndexImmutable(f func(int64, *Index) bool) { for id, idx := range coll.indices { if f(id, idx) { return } } } // ColNum returns the number of columns in the HistColl. func (coll *HistColl) ColNum() int { return len(coll.columns) } // IdxNum returns the number of indices in the HistColl. func (coll *HistColl) IdxNum() int { return len(coll.indices) } // DelCol deletes the column with the given id. func (t *Table) DelCol(id int64) { delete(t.columns, id) t.ColAndIdxExistenceMap.DeleteColNotFound(id) } // DelIdx deletes the index with the given id. func (t *Table) DelIdx(id int64) { delete(t.indices, id) t.ColAndIdxExistenceMap.DeleteIdxNotFound(id) } // StableOrderColSlice returns a slice of columns in stable order. func (coll *HistColl) StableOrderColSlice() []*Column { cols := make([]*Column, 0, len(coll.columns)) for _, col := range coll.columns { cols = append(cols, col) } slices.SortFunc(cols, func(c1, c2 *Column) int { return cmp.Compare(c1.ID, c2.ID) }) return cols } // GetColSlice returns a slice of columns without order. func (coll *HistColl) GetColSlice() []*Column { cols := make([]*Column, 0, len(coll.columns)) for _, col := range coll.columns { cols = append(cols, col) } return cols } // StableOrderIdxSlice returns a slice of indices in stable order. func (coll *HistColl) StableOrderIdxSlice() []*Index { idxs := make([]*Index, 0, len(coll.indices)) for _, idx := range coll.indices { idxs = append(idxs, idx) } slices.SortFunc(idxs, func(i1, i2 *Index) int { return cmp.Compare(i1.ID, i2.ID) }) return idxs } // GetIdxSlice returns a slice of indices without order. func (coll *HistColl) GetIdxSlice() []*Index { idxs := make([]*Index, 0, len(coll.indices)) for _, idx := range coll.indices { idxs = append(idxs, idx) } return idxs } // SetAllIndexFullLoadForBootstrap sets all indices' stats loaded status to full load for bootstrap. func (coll *HistColl) SetAllIndexFullLoadForBootstrap() { for _, idx := range coll.indices { idx.StatsLoadedStatus = NewStatsFullLoadStatus() } } // CalcPreScalar calculates the pre-calculated scalar for all columns and indices. func (coll *HistColl) CalcPreScalar() { for _, idx := range coll.indices { for i := 1; i < idx.Len(); i++ { idx.Buckets[i].Count += idx.Buckets[i-1].Count } idx.PreCalculateScalar() } for _, col := range coll.columns { for i := 1; i < col.Len(); i++ { col.Buckets[i].Count += col.Buckets[i-1].Count } col.PreCalculateScalar() } } // DropEvicted will drop the unnecessary data for all columns and indices. It's triggerred by stats cache. func (coll *HistColl) DropEvicted() { for _, col := range coll.columns { if !col.IsStatsInitialized() || col.GetEvictedStatus() == AllEvicted { continue } col.DropUnnecessaryData() } for _, idx := range coll.indices { if !idx.IsStatsInitialized() || idx.GetEvictedStatus() == AllEvicted { continue } idx.DropUnnecessaryData() } } // TableMemoryUsage records tbl memory usage type TableMemoryUsage struct { ColumnsMemUsage map[int64]CacheItemMemoryUsage IndicesMemUsage map[int64]CacheItemMemoryUsage TableID int64 TotalMemUsage int64 } // TotalIdxTrackingMemUsage returns total indices' tracking memory usage func (t *TableMemoryUsage) TotalIdxTrackingMemUsage() (sum int64) { for _, idx := range t.IndicesMemUsage { sum += idx.TrackingMemUsage() } return sum } // TotalColTrackingMemUsage returns total columns' tracking memory usage func (t *TableMemoryUsage) TotalColTrackingMemUsage() (sum int64) { for _, col := range t.ColumnsMemUsage { sum += col.TrackingMemUsage() } return sum } // TotalTrackingMemUsage return total tracking memory usage func (t *TableMemoryUsage) TotalTrackingMemUsage() int64 { return t.TotalIdxTrackingMemUsage() + t.TotalColTrackingMemUsage() } // TableCacheItem indicates the unit item stored in statsCache, eg: Column/Index type TableCacheItem interface { ItemID() int64 MemoryUsage() CacheItemMemoryUsage IsAllEvicted() bool GetEvictedStatus() int DropUnnecessaryData() IsStatsInitialized() bool GetStatsVer() int64 } // CacheItemMemoryUsage indicates the memory usage of TableCacheItem type CacheItemMemoryUsage interface { ItemID() int64 TotalMemoryUsage() int64 TrackingMemUsage() int64 HistMemUsage() int64 TopnMemUsage() int64 CMSMemUsage() int64 } // ColumnMemUsage records column memory usage type ColumnMemUsage struct { ColumnID int64 HistogramMemUsage int64 CMSketchMemUsage int64 FMSketchMemUsage int64 TopNMemUsage int64 TotalMemUsage int64 } // TotalMemoryUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) TotalMemoryUsage() int64 { return c.TotalMemUsage } // ItemID implements CacheItemMemoryUsage func (c *ColumnMemUsage) ItemID() int64 { return c.ColumnID } // TrackingMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) TrackingMemUsage() int64 { return c.CMSketchMemUsage + c.TopNMemUsage + c.HistogramMemUsage } // HistMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) HistMemUsage() int64 { return c.HistogramMemUsage } // TopnMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) TopnMemUsage() int64 { return c.TopNMemUsage } // CMSMemUsage implements CacheItemMemoryUsage func (c *ColumnMemUsage) CMSMemUsage() int64 { return c.CMSketchMemUsage } // IndexMemUsage records index memory usage type IndexMemUsage struct { IndexID int64 HistogramMemUsage int64 CMSketchMemUsage int64 TopNMemUsage int64 TotalMemUsage int64 } // TotalMemoryUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) TotalMemoryUsage() int64 { return c.TotalMemUsage } // ItemID implements CacheItemMemoryUsage func (c *IndexMemUsage) ItemID() int64 { return c.IndexID } // TrackingMemUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) TrackingMemUsage() int64 { return c.CMSketchMemUsage + c.TopNMemUsage + c.HistogramMemUsage } // HistMemUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) HistMemUsage() int64 { return c.HistogramMemUsage } // TopnMemUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) TopnMemUsage() int64 { return c.TopNMemUsage } // CMSMemUsage implements CacheItemMemoryUsage func (c *IndexMemUsage) CMSMemUsage() int64 { return c.CMSketchMemUsage } // MemoryUsage returns the total memory usage of this Table. // it will only calc the size of Columns and Indices stats data of table. // We ignore the size of other metadata in Table func (t *Table) MemoryUsage() *TableMemoryUsage { tMemUsage := &TableMemoryUsage{ TableID: t.PhysicalID, ColumnsMemUsage: make(map[int64]CacheItemMemoryUsage), IndicesMemUsage: make(map[int64]CacheItemMemoryUsage), } for _, col := range t.columns { if col != nil { colMemUsage := col.MemoryUsage() tMemUsage.ColumnsMemUsage[colMemUsage.ItemID()] = colMemUsage tMemUsage.TotalMemUsage += colMemUsage.TotalMemoryUsage() } } for _, index := range t.indices { if index != nil { idxMemUsage := index.MemoryUsage() tMemUsage.IndicesMemUsage[idxMemUsage.ItemID()] = idxMemUsage tMemUsage.TotalMemUsage += idxMemUsage.TotalMemoryUsage() } } return tMemUsage } // CopyAs creates a copy of the table with the specified writability intent. // // PERFORMANCE NOTE: Choose the most minimal intent for your use case. Copying is heavily // used at scale and unnecessary cloning causes significant memory pressure. Only use // AllDataWritable when you truly need to modify histogram data. // // MetaOnly: Shares all maps, only metadata modifications are safe // ColumnMapWritable: Clones columns map, safe to add/remove columns // IndexMapWritable: Clones indices map, safe to add/remove indices // BothMapsWritable: Clones both maps - safe to add/remove columns and indices // ExtendedStatsWritable: Shares all maps, safe to modify ExtendedStats field // AllDataWritable: Deep copies everything, safe to modify all data including histograms func (t *Table) CopyAs(intent CopyIntent) *Table { var columns map[int64]*Column var indices map[int64]*Index var existenceMap *ColAndIdxExistenceMap switch intent { case MetaOnly: columns = t.columns indices = t.indices existenceMap = t.ColAndIdxExistenceMap case ColumnMapWritable: columns = maps.Clone(t.columns) indices = t.indices if t.ColAndIdxExistenceMap != nil { existenceMap = t.ColAndIdxExistenceMap.Clone() } case IndexMapWritable: columns = t.columns indices = maps.Clone(t.indices) if t.ColAndIdxExistenceMap != nil { existenceMap = t.ColAndIdxExistenceMap.Clone() } case BothMapsWritable: columns = maps.Clone(t.columns) indices = maps.Clone(t.indices) if t.ColAndIdxExistenceMap != nil { existenceMap = t.ColAndIdxExistenceMap.Clone() } case ExtendedStatsWritable: columns = t.columns indices = t.indices existenceMap = t.ColAndIdxExistenceMap case AllDataWritable: // For deep copy, create new maps and deep copy all content columns = make(map[int64]*Column, len(t.columns)) for id, col := range t.columns { columns[id] = col.Copy() } indices = make(map[int64]*Index, len(t.indices)) for id, idx := range t.indices { indices[id] = idx.Copy() } if t.ColAndIdxExistenceMap != nil { existenceMap = t.ColAndIdxExistenceMap.Clone() } } newHistColl := HistColl{ PhysicalID: t.PhysicalID, RealtimeCount: t.RealtimeCount, columns: columns, indices: indices, Pseudo: t.Pseudo, ModifyCount: t.ModifyCount, StatsVer: t.StatsVer, } nt := &Table{ HistColl: newHistColl, Version: t.Version, TblInfoUpdateTS: t.TblInfoUpdateTS, ColAndIdxExistenceMap: existenceMap, LastAnalyzeVersion: t.LastAnalyzeVersion, LastStatsHistVersion: t.LastStatsHistVersion, } // Handle ExtendedStats for deep copy vs shallow copy if (intent == AllDataWritable || intent == ExtendedStatsWritable) && t.ExtendedStats != nil { newExtStatsColl := &ExtendedStatsColl{ Stats: make(map[string]*ExtendedStatsItem), LastUpdateVersion: t.ExtendedStats.LastUpdateVersion, } maps.Copy(newExtStatsColl.Stats, t.ExtendedStats.Stats) nt.ExtendedStats = newExtStatsColl } else { nt.ExtendedStats = t.ExtendedStats } return nt } // String implements Stringer interface. func (t *Table) String() string { strs := make([]string, 0, len(t.columns)+1) strs = append(strs, fmt.Sprintf("Table:%d RealtimeCount:%d", t.PhysicalID, t.RealtimeCount)) cols := make([]*Column, 0, len(t.columns)) for _, col := range t.columns { cols = append(cols, col) } slices.SortFunc(cols, func(i, j *Column) int { return cmp.Compare(i.ID, j.ID) }) for _, col := range cols { strs = append(strs, col.String()) } idxs := make([]*Index, 0, len(t.indices)) for _, idx := range t.indices { idxs = append(idxs, idx) } slices.SortFunc(idxs, func(i, j *Index) int { return cmp.Compare(i.ID, j.ID) }) for _, idx := range idxs { strs = append(strs, idx.String()) } // TODO: concat content of ExtendedStatsColl return strings.Join(strs, "\n") } // IndexStartWithColumn finds the first index whose first column is the given column. func (t *Table) IndexStartWithColumn(colName string) *Index { for _, index := range t.indices { if index.Info.Columns[0].Name.L == colName { return index } } return nil } // ColumnByName finds the statistics.Column for the given column. func (t *Table) ColumnByName(colName string) *Column { for _, c := range t.columns { if c.Info.Name.L == colName { return c } } return nil } // GetStatsInfo returns their statistics according to the ID of the column or index, including histogram, CMSketch, TopN and FMSketch. // // needCopy: In order to protect the item in the cache from being damaged, we need to copy the item. func (t *Table) GetStatsInfo(id int64, isIndex bool, needCopy bool) (*Histogram, *CMSketch, *TopN, *FMSketch, bool) { if isIndex { if idxStatsInfo, ok := t.indices[id]; ok { if needCopy { return idxStatsInfo.Histogram.Copy(), idxStatsInfo.CMSketch.Copy(), idxStatsInfo.TopN.Copy(), idxStatsInfo.FMSketch.Copy(), true } return &idxStatsInfo.Histogram, idxStatsInfo.CMSketch, idxStatsInfo.TopN, idxStatsInfo.FMSketch, true } // newly added index which is not analyzed yet return nil, nil, nil, nil, false } if colStatsInfo, ok := t.columns[id]; ok { if needCopy { return colStatsInfo.Histogram.Copy(), colStatsInfo.CMSketch.Copy(), colStatsInfo.TopN.Copy(), colStatsInfo.FMSketch.Copy(), true } return &colStatsInfo.Histogram, colStatsInfo.CMSketch, colStatsInfo.TopN, colStatsInfo.FMSketch, true } // newly added column which is not analyzed yet return nil, nil, nil, nil, false } // IsAnalyzed checks whether the table is analyzed or not by checking its last analyze's timestamp value. // A valid timestamp must be greater than 0. func (t *Table) IsAnalyzed() bool { return t.LastAnalyzeVersion > 0 } // IsEligibleForAnalysis checks whether the table is eligible for analysis. func (t *Table) IsEligibleForAnalysis() bool { // 1. If the statistics are either not loaded or are classified as pseudo, there is no need for analyze. // Pseudo statistics can be created by the optimizer, so we need to double check it. // 2. If the table is too small, we don't want to waste time to analyze it. // Leave the opportunity to other bigger tables. if !t.MeetAutoAnalyzeMinCnt() || t.Pseudo { return false } return true } // MeetAutoAnalyzeMinCnt checks whether the table meets the minimum count required for auto-analyze. func (t *Table) MeetAutoAnalyzeMinCnt() bool { return t != nil && t.RealtimeCount >= AutoAnalyzeMinCnt } // GetAnalyzeRowCount tries to get the row count of a column or an index if possible. // This method is useful because this row count doesn't consider the modify count. func (coll *HistColl) GetAnalyzeRowCount() float64 { ids := slices.Collect(maps.Keys(coll.columns)) slices.Sort(ids) for _, id := range ids { col := coll.columns[id] if col != nil && col.IsFullLoad() { return col.TotalRowCount() } } clear(ids) ids = slices.Grow(ids, len(coll.indices)) ids = slices.AppendSeq(ids, maps.Keys(coll.indices)) slices.Sort(ids) for _, id := range ids { idx := coll.indices[id] if idx == nil { continue } if idx.Info != nil && idx.Info.MVIndex { continue } if idx.IsFullLoad() { return idx.TotalRowCount() } } return -1 } // GetScaledRealtimeAndModifyCnt scale the RealtimeCount and ModifyCount for some special indexes where the total row // count is different from the total row count of the table. Currently, only the mv index is this case. // Because we will use the RealtimeCount and ModifyCount during the estimation for ranges on this index (like the upper // bound for the out-of-range estimation logic and the IncreaseFactor logic), we can't directly use the RealtimeCount and // ModifyCount of the table. Instead, we should scale them before using. // For example, if the table analyze row count is 1000 and realtime row count is 1500, and the mv index total count is 5000, // when calculating the IncreaseFactor, it should be 1500/1000 = 1.5 for normal columns/indexes, and we should use the // same 1.5 for mv index. But obviously, use 1500/5000 would be wrong, the correct calculation should be 7500/5000 = 1.5. // So we add this function to get this 7500. func (coll *HistColl) GetScaledRealtimeAndModifyCnt(idxStats *Index) (realtimeCnt, modifyCnt int64) { // In theory, we can apply this scale logic on all indexes. But currently, we only apply it on the mv index to avoid // any unexpected changes caused by factors like precision difference. if idxStats == nil || idxStats.Info == nil || !idxStats.Info.MVIndex || !idxStats.IsFullLoad() { return coll.RealtimeCount, coll.ModifyCount } analyzeRowCount := coll.GetAnalyzeRowCount() if analyzeRowCount <= 0 { return coll.RealtimeCount, coll.ModifyCount } idxTotalRowCount := idxStats.TotalRowCount() if idxTotalRowCount <= 0 { return coll.RealtimeCount, coll.ModifyCount } scale := idxTotalRowCount / analyzeRowCount return int64(float64(coll.RealtimeCount) * scale), int64(float64(coll.ModifyCount) * scale) } // GetStatsHealthy calculates stats healthy if the table stats is not pseudo. // If the table stats is pseudo, it returns 0, false, otherwise it returns stats healthy, true. func (t *Table) GetStatsHealthy() (int64, bool) { if t == nil || t.Pseudo { return 0, false } if !t.IsAnalyzed() { return 0, true } var healthy int64 count := float64(t.RealtimeCount) if histCount := t.GetAnalyzeRowCount(); histCount > 0 { count = histCount } if float64(t.ModifyCount) < count { healthy = int64((1.0 - float64(t.ModifyCount)/count) * 100.0) } else if t.ModifyCount == 0 { healthy = 100 } return healthy, true } // ColumnIsLoadNeeded checks whether the column needs trigger the async/sync load. // The Column should be visible in the table and really has analyzed statistics in the storage. // Also, if the stats has been loaded into the memory, we also don't need to load it. // We return the Column together with the checking result, to avoid accessing the map multiple times. // The first bool is whether we need to load it into memory. The second bool is whether this column has stats in the system table or not. func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (col *Column, loadNeeded, hasAnalyzed bool) { if t.Pseudo { return nil, false, false } hasAnalyzed = t.ColAndIdxExistenceMap.HasAnalyzed(id, false) col, ok := t.columns[id] if !ok { // If The column have no stats object in memory. We need to check it by existence map. // If existence map says it even has no unitialized record in storage, we don't need to do anything. => Has=false, HasAnalyzed=false // If existence map says it has analyzed stats, we need to load it from storage. => Has=true, HasAnalyzed=true // If existence map says it has no analyzed stats but have a uninitialized record in storage, we need to also create a fake object. => Has=true, HasAnalyzed=false return nil, t.ColAndIdxExistenceMap.Has(id, false), hasAnalyzed } // If it's not analyzed yet. // The real check condition: !ok && !hashAnalyzed.(Has must be true since we've have the memory object so we should have the storage object) // After this check, we will always have ok && hasAnalyzed. if !hasAnalyzed { return nil, false, false } // Restore the condition from the simplified form: // 1. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load // 2. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load if (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) { return col, true, true } // Otherwise don't need load it. return col, false, true } // IndexIsLoadNeeded checks whether the index needs trigger the async/sync load. // The Index should be visible in the table and really has analyzed statistics in the storage. // Also, if the stats has been loaded into the memory, we also don't need to load it. // We return the Index together with the checking result, to avoid accessing the map multiple times. func (t *Table) IndexIsLoadNeeded(id int64) (*Index, bool) { idx, ok := t.indices[id] // If the index is not in the memory, and we have its stats in the storage. We need to trigger the load. if !ok && t.ColAndIdxExistenceMap.HasAnalyzed(id, true) { return nil, true } // If the index is in the memory, we check its embedded func. if ok && idx.IsAnalyzed() && !idx.IsFullLoad() { return idx, true } return idx, false } // RatioOfPseudoEstimate means if modifyCount / statsTblCount is greater than this ratio, we think the stats is invalid // and use pseudo estimation. var RatioOfPseudoEstimate = atomic.NewFloat64(0.7) // IsInitialized returns true if any column/index stats of the table is initialized. func (t *Table) IsInitialized() bool { for _, col := range t.columns { if col != nil && col.IsStatsInitialized() { return true } } for _, idx := range t.indices { if idx != nil && idx.IsStatsInitialized() { return true } } return false } // IsOutdated returns true if the table stats is outdated. func (t *Table) IsOutdated() bool { rowcount := t.GetAnalyzeRowCount() if rowcount < 0 { rowcount = float64(t.RealtimeCount) } if rowcount > 0 && float64(t.ModifyCount)/rowcount > RatioOfPseudoEstimate.Load() { return true } return false } // ReleaseAndPutToPool releases data structures of Table and put itself back to pool. func (t *Table) ReleaseAndPutToPool() { for _, col := range t.columns { col.FMSketch.DestroyAndPutToPool() } clear(t.columns) for _, idx := range t.indices { idx.FMSketch.DestroyAndPutToPool() } clear(t.indices) } // ID2UniqueID generates a new HistColl whose `Columns` is built from UniqueID of given columns. func (coll *HistColl) ID2UniqueID(columns []*expression.Column) *HistColl { cols := make(map[int64]*Column) for _, col := range columns { colHist, ok := coll.columns[col.ID] if ok { cols[col.UniqueID] = colHist } } newColl := &HistColl{ PhysicalID: coll.PhysicalID, Pseudo: coll.Pseudo, RealtimeCount: coll.RealtimeCount, ModifyCount: coll.ModifyCount, columns: cols, } return newColl } // GenerateHistCollFromColumnInfo generates a new HistColl whose ColUniqueID2IdxIDs and Idx2ColUniqueIDs is built from the given parameter. func (coll *HistColl) GenerateHistCollFromColumnInfo(tblInfo *model.TableInfo, columns []*expression.Column) *HistColl { newColHistMap := make(map[int64]*Column) colInfoID2Col := make(map[int64]*expression.Column, len(columns)) colInfoID2UniqueID := make(map[int64]int64, len(columns)) uniqueID2colInfoID := make(map[int64]int64, len(columns)) idxID2idxInfo := make(map[int64]*model.IndexInfo) for _, col := range columns { colInfoID2Col[col.ID] = col colInfoID2UniqueID[col.ID] = col.UniqueID uniqueID2colInfoID[col.UniqueID] = col.ID } for id, colHist := range coll.columns { uniqueID, ok := colInfoID2UniqueID[id] // Collect the statistics by the given columns. if ok { newColHistMap[uniqueID] = colHist } } for _, idxInfo := range tblInfo.Indices { idxID2idxInfo[idxInfo.ID] = idxInfo } newIdxHistMap := make(map[int64]*Index) idx2Columns := make(map[int64][]int64) colID2IdxIDs := make(map[int64][]int64) mvIdx2Columns := make(map[int64][]*expression.Column) for id, idxHist := range coll.indices { idxInfo := idxID2idxInfo[id] if idxInfo == nil { continue } ids := make([]int64, 0, len(idxInfo.Columns)) for _, idxCol := range idxInfo.Columns { uniqueID, ok := colInfoID2UniqueID[tblInfo.Columns[idxCol.Offset].ID] if !ok { break } ids = append(ids, uniqueID) } // If the length of the id list is 0, this index won't be used in this query. if len(ids) == 0 { continue } colID2IdxIDs[ids[0]] = append(colID2IdxIDs[ids[0]], idxHist.ID) newIdxHistMap[idxHist.ID] = idxHist idx2Columns[idxHist.ID] = ids if idxInfo.MVIndex { cols, ok := PrepareCols4MVIndex(tblInfo, idxInfo, colInfoID2Col, true) if ok { mvIdx2Columns[id] = cols } } } for _, idxIDs := range colID2IdxIDs { slices.Sort(idxIDs) } newColl := &HistColl{ PhysicalID: coll.PhysicalID, Pseudo: coll.Pseudo, RealtimeCount: coll.RealtimeCount, ModifyCount: coll.ModifyCount, columns: newColHistMap, indices: newIdxHistMap, ColUniqueID2IdxIDs: colID2IdxIDs, Idx2ColUniqueIDs: idx2Columns, UniqueID2colInfoID: uniqueID2colInfoID, MVIdx2Columns: mvIdx2Columns, } return newColl } // PseudoHistColl creates a lightweight pseudo HistColl for cost calculation. // This is optimized for cases where only HistColl is needed, avoiding the overhead // of creating a full pseudo table with ColAndIdxExistenceMap and other structures. func PseudoHistColl(physicalID int64, allowTriggerLoading bool) HistColl { return HistColl{ RealtimeCount: PseudoRowCount, PhysicalID: physicalID, columns: nil, indices: nil, Pseudo: true, CanNotTriggerLoad: !allowTriggerLoading, ModifyCount: 0, StatsVer: 0, } } // PseudoTable creates a pseudo table statistics. // Usually, we don't want to trigger stats loading for pseudo table. // But there are exceptional cases. In such cases, we should pass allowTriggerLoading as true. // Such case could possibly happen in getStatsTable(). func PseudoTable(tblInfo *model.TableInfo, allowTriggerLoading bool, allowFillHistMeta bool) *Table { t := &Table{ HistColl: PseudoHistColl(tblInfo.ID, allowTriggerLoading), Version: PseudoVersion, ColAndIdxExistenceMap: NewColAndIndexExistenceMap(len(tblInfo.Columns), len(tblInfo.Indices)), } // Initialize columns and indices maps only when allowFillHistMeta is true if allowFillHistMeta { t.columns = make(map[int64]*Column, len(tblInfo.Columns)) t.indices = make(map[int64]*Index, len(tblInfo.Indices)) } for _, col := range tblInfo.Columns { // The column is public to use. Also we should check the column is not hidden since hidden means that it's used by expression index. // We would not collect stats for the hidden column and we won't use the hidden column to estimate. // Thus we don't create pseudo stats for it. if col.State == model.StatePublic && !col.Hidden { t.ColAndIdxExistenceMap.InsertCol(col.ID, false) if allowFillHistMeta { t.columns[col.ID] = &Column{ PhysicalID: tblInfo.ID, Info: col, IsHandle: tblInfo.PKIsHandle && mysql.HasPriKeyFlag(col.GetFlag()), Histogram: *NewPseudoHistogram(col.ID, &col.FieldType), } } } } for _, idx := range tblInfo.Indices { if idx.State == model.StatePublic { t.ColAndIdxExistenceMap.InsertIndex(idx.ID, false) if allowFillHistMeta { t.indices[idx.ID] = &Index{ PhysicalID: tblInfo.ID, Info: idx, Histogram: *NewPseudoHistogram(idx.ID, types.NewFieldType(mysql.TypeBlob)), } } } } return t } // CheckAnalyzeVerOnTable checks whether the given version is the one from the tbl. // If not, it will return false and set the version to the tbl's. // We use this check to make sure all the statistics of the table are in the same version. func CheckAnalyzeVerOnTable(tbl *Table, version *int) bool { if IsAnalyzed(int64(tbl.StatsVer)) && tbl.StatsVer != *version { *version = tbl.StatsVer return false } return true } // PrepareCols4MVIndex helps to identify the columns of an MV index. We need this information for estimation. // This logic is shared between the estimation logic and the access path generation logic. We'd like to put the mv index // related functions together in the planner/core package. So we use this trick here to avoid the import cycle. var PrepareCols4MVIndex func( tableInfo *model.TableInfo, mvIndex *model.IndexInfo, tblColsByID map[int64]*expression.Column, checkOnly1ArrayTypeCol bool, ) (idxCols []*expression.Column, ok bool)