Files
tidb/pkg/statistics/table.go
2025-12-10 12:09:44 +00:00

1147 lines
40 KiB
Go

// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package statistics
import (
"cmp"
"fmt"
"maps"
"slices"
"strings"
"github.com/pingcap/tidb/pkg/expression"
"github.com/pingcap/tidb/pkg/meta/model"
"github.com/pingcap/tidb/pkg/parser/mysql"
"github.com/pingcap/tidb/pkg/planner/planctx"
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/ranger"
"go.uber.org/atomic"
)
const (
	// PseudoVersion means the pseudo statistics version is 0.
	// A table whose stats Version equals PseudoVersion has never been analyzed.
	PseudoVersion uint64 = 0
	// PseudoRowCount export for other pkg to use.
	// When we haven't analyzed a table, we use pseudo statistics to estimate costs.
	// It has row count 10000, equal condition selects 1/1000 of total rows, less condition selects 1/3 of total rows,
	// between condition selects 1/40 of total rows.
	PseudoRowCount = 10000
)
// CopyIntent specifies what data structures are safe to modify in the copied table.
// It is consumed by Table.CopyAs to decide how much of the table must be cloned.
type CopyIntent uint8

const (
	// MetaOnly shares all maps - only table metadata is safe to modify
	MetaOnly CopyIntent = iota
	// ColumnMapWritable clones columns map - safe to add/remove columns
	ColumnMapWritable
	// IndexMapWritable clones indices map - safe to add/remove indices
	IndexMapWritable
	// BothMapsWritable clones both maps - safe to add/remove columns and indices
	BothMapsWritable
	// ExtendedStatsWritable shares all maps - safe to modify ExtendedStats field
	ExtendedStatsWritable
	// AllDataWritable deep copies everything - safe to modify all data including histograms
	AllDataWritable
)
// AutoAnalyzeMinCnt means if the count of table is less than this value, we don't need to do auto analyze.
// Exported for testing.
var AutoAnalyzeMinCnt int64 = 1000
var (
	// Below functions are used to solve cycle import problem.
	// Note: all functions below will be removed after finishing moving all estimation functions into the cardinality package.
	// GetRowCountByIndexRanges is a function type to get row count by index ranges.
	GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range, idxCol []*expression.Column) (result RowEstimate, err error)
	// GetRowCountByColumnRanges is a function type to get row count by column ranges.
	GetRowCountByColumnRanges func(sctx planctx.PlanContext, coll *HistColl, colID int64, colRanges []*ranger.Range, pkIsHandle bool) (result RowEstimate, err error)
)
// Table represents statistics for a table.
type Table struct {
	ExtendedStats         *ExtendedStatsColl
	ColAndIdxExistenceMap *ColAndIdxExistenceMap
	HistColl
	Version uint64
	// It's the timestamp of the last analyze time.
	// We used it in auto-analyze to determine if this table has been analyzed.
	// The source of this field comes from two parts:
	// 1. Initialized by snapshot when loading stats_meta.
	// 2. Updated by the analysis time of a specific column or index when loading the histogram of the column or index.
	LastAnalyzeVersion uint64
	// LastStatsHistVersion is the mvcc version of the last update of histograms.
	// It differs from LastAnalyzeVersion because it can be influenced by some DDL.
	// e.g. When we execute ALTER TABLE ADD COLUMN, there'll be new record inserted into mysql.stats_histograms.
	// We need to load the corresponding one into memory too.
	// It's used to skip redundant loading of stats, i.e, if the cached stats is already up-to-date with mysql.stats_xxx tables,
	// and the schema of the table does not change, we don't need to load the stats for this table again.
	// Stats' sync load/async load should not change this field since they are not table-level update.
	// It's hard to deal with the upgrade compatibility of this field, the field will not take effect unless
	// auto analyze or DDL happened on the table.
	LastStatsHistVersion uint64
	// TblInfoUpdateTS is the UpdateTS of the TableInfo used when filling this struct.
	// It is the schema version of the corresponding table. It is used to skip redundant
	// loading of stats, i.e, if the cached stats is already up-to-date with mysql.stats_xxx tables,
	// and the schema of the table does not change, we don't need to load the stats for this
	// table again.
	// TODO: it can be removed now that we have LastAnalyzeVersion and LastStatsHistVersion.
	TblInfoUpdateTS uint64
	IsPkIsHandle    bool
}
// ColAndIdxExistenceMap is the meta map for statistics.Table.
// It can tell whether a column/index really has its statistics. So we won't send useless kv request when we do online stats loading.
// For each entry, the bool value records whether the column/index has been analyzed.
type ColAndIdxExistenceMap struct {
	colAnalyzed map[int64]bool
	idxAnalyzed map[int64]bool
}
// DeleteColNotFound deletes the column with the given id.
// Deleting a missing id is a no-op (plain map delete).
func (m *ColAndIdxExistenceMap) DeleteColNotFound(id int64) {
	delete(m.colAnalyzed, id)
}
// DeleteIdxNotFound deletes the index with the given id.
// Deleting a missing id is a no-op (plain map delete).
func (m *ColAndIdxExistenceMap) DeleteIdxNotFound(id int64) {
	delete(m.idxAnalyzed, id)
}
// HasAnalyzed checks whether a column/index stats exists and it has stats.
// TODO: the map should only keep the analyzed cols.
// There's three possible status of column/index's statistics:
//  1. We don't have this column/index.
//  2. We have it, but it hasn't been analyzed yet.
//  3. We have it and its statistics.
//
// HasAnalyzed returning TRUE corresponds to status 3; Has returning FALSE corresponds to status 1.
// Beginning from v8.5.2, case 1 is a nearly invalid case. It's just a middle state between the DDL
// happening and the completion of the stats' ddl handler.
// But we may need to deal with case 1 for upgrade compatibility.
func (m *ColAndIdxExistenceMap) HasAnalyzed(id int64, isIndex bool) bool {
	entries := m.colAnalyzed
	if isIndex {
		entries = m.idxAnalyzed
	}
	// A missing key yields the zero value false, which covers both
	// "not present" and "present but not analyzed".
	return entries[id]
}
// Has checks whether a column/index stats exists.
func (m *ColAndIdxExistenceMap) Has(id int64, isIndex bool) bool {
	entries := m.colAnalyzed
	if isIndex {
		entries = m.idxAnalyzed
	}
	_, found := entries[id]
	return found
}
// InsertCol inserts a column with its meta into the map.
// An existing entry with the same id is overwritten.
func (m *ColAndIdxExistenceMap) InsertCol(id int64, analyzed bool) {
	m.colAnalyzed[id] = analyzed
}
// InsertIndex inserts an index with its meta into the map.
// An existing entry with the same id is overwritten.
func (m *ColAndIdxExistenceMap) InsertIndex(id int64, analyzed bool) {
	m.idxAnalyzed[id] = analyzed
}
// IsEmpty checks whether the map is empty.
func (m *ColAndIdxExistenceMap) IsEmpty() bool {
	return len(m.colAnalyzed) == 0 && len(m.idxAnalyzed) == 0
}
// ColNum returns the number of columns in the map.
func (m *ColAndIdxExistenceMap) ColNum() int {
	return len(m.colAnalyzed)
}
// Clone deeply copies the map.
// The returned map is fully independent: mutating it never affects the receiver.
func (m *ColAndIdxExistenceMap) Clone() *ColAndIdxExistenceMap {
	// Build the clone directly from maps.Clone. The previous implementation
	// first called NewColAndIndexExistenceMap, allocating two maps that were
	// immediately discarded when overwritten by the maps.Clone results.
	return &ColAndIdxExistenceMap{
		colAnalyzed: maps.Clone(m.colAnalyzed),
		idxAnalyzed: maps.Clone(m.idxAnalyzed),
	}
}
const (
	// defaultColCap is the default column-map capacity used when no size hint is given.
	defaultColCap = 16
	// defaultIdxCap is the default index-map capacity used when no size hint is given.
	defaultIdxCap = 4
)
// NewColAndIndexExistenceMapWithoutSize returns a new object with default capacity.
func NewColAndIndexExistenceMapWithoutSize() *ColAndIdxExistenceMap {
	return NewColAndIndexExistenceMap(defaultColCap, defaultIdxCap)
}
// NewColAndIndexExistenceMap returns a new object with the given capacity.
func NewColAndIndexExistenceMap(colCap, idxCap int) *ColAndIdxExistenceMap {
	m := &ColAndIdxExistenceMap{}
	m.colAnalyzed = make(map[int64]bool, colCap)
	m.idxAnalyzed = make(map[int64]bool, idxCap)
	return m
}
// ColAndIdxExistenceMapIsEqual is used in testing, checking whether the two are equal.
func ColAndIdxExistenceMapIsEqual(m1, m2 *ColAndIdxExistenceMap) bool {
	if !maps.Equal(m1.colAnalyzed, m2.colAnalyzed) {
		return false
	}
	return maps.Equal(m1.idxAnalyzed, m2.idxAnalyzed)
}
// ExtendedStatsItem is the cached item of a mysql.stats_extended record.
type ExtendedStatsItem struct {
	StringVals string
	ColIDs     []int64
	ScalarVals float64
	// Tp is the extended stats type code (see the ExtendedStats* status constants for the
	// separate status codes; Tp itself is the stats kind stored in mysql.stats_extended).
	Tp uint8
}
// ExtendedStatsColl is a collection of cached items for mysql.stats_extended records.
type ExtendedStatsColl struct {
	// Stats is keyed by the extended stats name.
	Stats             map[string]*ExtendedStatsItem
	LastUpdateVersion uint64
}
// NewExtendedStatsColl allocate an ExtendedStatsColl struct.
func NewExtendedStatsColl() *ExtendedStatsColl {
	coll := &ExtendedStatsColl{}
	coll.Stats = make(map[string]*ExtendedStatsItem)
	return coll
}
const (
	// ExtendedStatsInited is the status for extended stats which are just registered but have not been analyzed yet.
	ExtendedStatsInited uint8 = iota
	// ExtendedStatsAnalyzed is the status for extended stats which have been collected in analyze.
	ExtendedStatsAnalyzed
	// ExtendedStatsDeleted is the status for extended stats which were dropped. These "deleted" records would be removed from storage by GCStats().
	ExtendedStatsDeleted
)
// HistColl is a collection of histograms. It collects enough information for plan to calculate the selectivity.
type HistColl struct {
	// Note that when used in a query, Column use UniqueID as the key while Indices use the index ID in the
	// metadata. (See GenerateHistCollFromColumnInfo() for details)
	columns    map[int64]*Column
	indices    map[int64]*Index
	PhysicalID int64
	// TODO: add AnalyzeCount here
	RealtimeCount int64 // RealtimeCount is the current table row count, maintained by applying stats delta based on AnalyzeCount.
	ModifyCount   int64 // Total modify count in a table.
	// The version of the statistics, refer to Version0, Version1, Version2 and so on.
	StatsVer int
	Pseudo   bool
	/*
		Fields below are only used in a query, like for estimation, and they will be useless when stored in
		the stats cache. (See GenerateHistCollFromColumnInfo() for details)
	*/
	CanNotTriggerLoad bool
	// Idx2ColUniqueIDs maps the index id to its column UniqueIDs. It's used to calculate the selectivity in planner.
	Idx2ColUniqueIDs map[int64][]int64
	// ColUniqueID2IdxIDs maps the column UniqueID to a list index ids whose first column is it.
	// It's used to calculate the selectivity in planner.
	ColUniqueID2IdxIDs map[int64][]int64
	// UniqueID2colInfoID maps the column UniqueID to its ID in the metadata.
	UniqueID2colInfoID map[int64]int64
	// MVIdx2Columns maps the index id to its columns by expression.Column.
	// For normal index, the column id is enough, as we already have in Idx2ColUniqueIDs. But currently, mv index needs more
	// information to match the filter against the mv index columns, and we need this map to provide this information.
	MVIdx2Columns map[int64][]*expression.Column
}
// NewHistColl creates a new HistColl.
// colNum and idxNum are capacity hints for the internal maps.
func NewHistColl(id int64, realtimeCnt, modifyCnt int64, colNum, idxNum int) *HistColl {
	coll := &HistColl{
		PhysicalID:    id,
		RealtimeCount: realtimeCnt,
		ModifyCount:   modifyCnt,
	}
	coll.columns = make(map[int64]*Column, colNum)
	coll.indices = make(map[int64]*Index, idxNum)
	coll.Idx2ColUniqueIDs = make(map[int64][]int64)
	coll.ColUniqueID2IdxIDs = make(map[int64][]int64)
	coll.UniqueID2colInfoID = make(map[int64]int64)
	coll.MVIdx2Columns = make(map[int64][]*expression.Column)
	return coll
}
// NewHistCollWithColsAndIdxs creates a new HistColl with given columns and indices.
// The provided maps are used directly (not copied).
func NewHistCollWithColsAndIdxs(id int64, realtimeCnt, modifyCnt int64, cols map[int64]*Column, idxs map[int64]*Index) *HistColl {
	coll := &HistColl{
		PhysicalID:    id,
		RealtimeCount: realtimeCnt,
		ModifyCount:   modifyCnt,
		columns:       cols,
		indices:       idxs,
	}
	coll.Idx2ColUniqueIDs = make(map[int64][]int64)
	coll.ColUniqueID2IdxIDs = make(map[int64][]int64)
	coll.UniqueID2colInfoID = make(map[int64]int64)
	coll.MVIdx2Columns = make(map[int64][]*expression.Column)
	return coll
}
// SetCol sets the column with the given id.
// An existing entry with the same id is overwritten.
func (coll *HistColl) SetCol(id int64, col *Column) {
	coll.columns[id] = col
}
// SetIdx sets the index with the given id.
// An existing entry with the same id is overwritten.
func (coll *HistColl) SetIdx(id int64, idx *Index) {
	coll.indices[id] = idx
}
// GetCol gets the column with the given id.
// Returns nil when the id is absent.
func (coll *HistColl) GetCol(id int64) *Column {
	return coll.columns[id]
}
// GetIdx gets the index with the given id.
// Returns nil when the id is absent.
func (coll *HistColl) GetIdx(id int64) *Index {
	return coll.indices[id]
}
// ForEachColumnImmutable iterates all columns in the HistColl.
// The bool return value of f is used to control the iteration. If f returns true, the iteration will be stopped.
// Warning: Don't change the content when calling this function.
func (coll *HistColl) ForEachColumnImmutable(f func(int64, *Column) bool) {
	for id, col := range coll.columns {
		stop := f(id, col)
		if stop {
			break
		}
	}
}
// ForEachIndexImmutable iterates all indices in the HistColl.
// The bool return value of f is used to control the iteration. If f returns true, the iteration will be stopped.
// WARNING: Don't change the content when calling this function.
func (coll *HistColl) ForEachIndexImmutable(f func(int64, *Index) bool) {
	for id, idx := range coll.indices {
		stop := f(id, idx)
		if stop {
			break
		}
	}
}
// ColNum returns the number of columns in the HistColl.
func (coll *HistColl) ColNum() int {
	return len(coll.columns)
}
// IdxNum returns the number of indices in the HistColl.
func (coll *HistColl) IdxNum() int {
	return len(coll.indices)
}
// DelCol deletes the column with the given id from both the histogram
// collection and the existence map.
func (t *Table) DelCol(id int64) {
	t.ColAndIdxExistenceMap.DeleteColNotFound(id)
	delete(t.columns, id)
}
// DelIdx deletes the index with the given id from both the histogram
// collection and the existence map.
func (t *Table) DelIdx(id int64) {
	t.ColAndIdxExistenceMap.DeleteIdxNotFound(id)
	delete(t.indices, id)
}
// StableOrderColSlice returns a slice of columns in stable order (ascending by column ID).
func (coll *HistColl) StableOrderColSlice() []*Column {
	cols := slices.AppendSeq(make([]*Column, 0, len(coll.columns)), maps.Values(coll.columns))
	slices.SortFunc(cols, func(a, b *Column) int {
		return cmp.Compare(a.ID, b.ID)
	})
	return cols
}
// GetColSlice returns a slice of columns without order.
func (coll *HistColl) GetColSlice() []*Column {
	return slices.AppendSeq(make([]*Column, 0, len(coll.columns)), maps.Values(coll.columns))
}
// StableOrderIdxSlice returns a slice of indices in stable order (ascending by index ID).
func (coll *HistColl) StableOrderIdxSlice() []*Index {
	idxs := slices.AppendSeq(make([]*Index, 0, len(coll.indices)), maps.Values(coll.indices))
	slices.SortFunc(idxs, func(a, b *Index) int {
		return cmp.Compare(a.ID, b.ID)
	})
	return idxs
}
// GetIdxSlice returns a slice of indices without order.
func (coll *HistColl) GetIdxSlice() []*Index {
	return slices.AppendSeq(make([]*Index, 0, len(coll.indices)), maps.Values(coll.indices))
}
// SetAllIndexFullLoadForBootstrap sets all indices' stats loaded status to full load for bootstrap.
func (coll *HistColl) SetAllIndexFullLoadForBootstrap() {
	for id := range coll.indices {
		coll.indices[id].StatsLoadedStatus = NewStatsFullLoadStatus()
	}
}
// CalcPreScalar calculates the pre-calculated scalar for all columns and indices.
// It converts each histogram's per-bucket counts into cumulative counts
// (Buckets[i].Count becomes the running total up to bucket i) and then
// pre-computes the scalar values.
// NOTE(review): this mutation is not idempotent — running it twice would
// accumulate the counts again, so it should only be applied once per load.
func (coll *HistColl) CalcPreScalar() {
	for _, idx := range coll.indices {
		for i := 1; i < idx.Len(); i++ {
			idx.Buckets[i].Count += idx.Buckets[i-1].Count
		}
		idx.PreCalculateScalar()
	}
	for _, col := range coll.columns {
		for i := 1; i < col.Len(); i++ {
			col.Buckets[i].Count += col.Buckets[i-1].Count
		}
		col.PreCalculateScalar()
	}
}
// DropEvicted will drop the unnecessary data for all columns and indices. It's triggered by stats cache.
// Items that are uninitialized or already fully evicted are left untouched.
func (coll *HistColl) DropEvicted() {
	for _, col := range coll.columns {
		if col.IsStatsInitialized() && col.GetEvictedStatus() != AllEvicted {
			col.DropUnnecessaryData()
		}
	}
	for _, idx := range coll.indices {
		if idx.IsStatsInitialized() && idx.GetEvictedStatus() != AllEvicted {
			idx.DropUnnecessaryData()
		}
	}
}
// TableMemoryUsage records tbl memory usage.
type TableMemoryUsage struct {
	// ColumnsMemUsage and IndicesMemUsage are keyed by column/index item ID.
	ColumnsMemUsage map[int64]CacheItemMemoryUsage
	IndicesMemUsage map[int64]CacheItemMemoryUsage
	TableID         int64
	TotalMemUsage   int64
}
// TotalIdxTrackingMemUsage returns total indices' tracking memory usage.
func (t *TableMemoryUsage) TotalIdxTrackingMemUsage() (sum int64) {
	for _, usage := range t.IndicesMemUsage {
		sum += usage.TrackingMemUsage()
	}
	return
}
// TotalColTrackingMemUsage returns total columns' tracking memory usage.
func (t *TableMemoryUsage) TotalColTrackingMemUsage() (sum int64) {
	for _, usage := range t.ColumnsMemUsage {
		sum += usage.TrackingMemUsage()
	}
	return
}
// TotalTrackingMemUsage return total tracking memory usage.
func (t *TableMemoryUsage) TotalTrackingMemUsage() int64 {
	idxUsage := t.TotalIdxTrackingMemUsage()
	colUsage := t.TotalColTrackingMemUsage()
	return idxUsage + colUsage
}
// TableCacheItem indicates the unit item stored in statsCache, eg: Column/Index.
type TableCacheItem interface {
	ItemID() int64
	MemoryUsage() CacheItemMemoryUsage
	IsAllEvicted() bool
	GetEvictedStatus() int
	DropUnnecessaryData()
	IsStatsInitialized() bool
	GetStatsVer() int64
}
// CacheItemMemoryUsage indicates the memory usage of TableCacheItem.
type CacheItemMemoryUsage interface {
	ItemID() int64
	TotalMemoryUsage() int64
	// TrackingMemUsage is the portion of memory counted against the stats cache quota.
	TrackingMemUsage() int64
	HistMemUsage() int64
	TopnMemUsage() int64
	CMSMemUsage() int64
}
// ColumnMemUsage records column memory usage.
type ColumnMemUsage struct {
	ColumnID          int64
	HistogramMemUsage int64
	CMSketchMemUsage  int64
	FMSketchMemUsage  int64
	TopNMemUsage      int64
	TotalMemUsage     int64
}
// TotalMemoryUsage implements CacheItemMemoryUsage.
func (c *ColumnMemUsage) TotalMemoryUsage() int64 {
	return c.TotalMemUsage
}
// ItemID implements CacheItemMemoryUsage.
func (c *ColumnMemUsage) ItemID() int64 {
	return c.ColumnID
}
// TrackingMemUsage implements CacheItemMemoryUsage.
// Note: FMSketchMemUsage is deliberately excluded from the tracked total.
func (c *ColumnMemUsage) TrackingMemUsage() int64 {
	return c.CMSketchMemUsage + c.TopNMemUsage + c.HistogramMemUsage
}
// HistMemUsage implements CacheItemMemoryUsage.
func (c *ColumnMemUsage) HistMemUsage() int64 {
	return c.HistogramMemUsage
}
// TopnMemUsage implements CacheItemMemoryUsage.
func (c *ColumnMemUsage) TopnMemUsage() int64 {
	return c.TopNMemUsage
}
// CMSMemUsage implements CacheItemMemoryUsage.
func (c *ColumnMemUsage) CMSMemUsage() int64 {
	return c.CMSketchMemUsage
}
// IndexMemUsage records index memory usage.
type IndexMemUsage struct {
	IndexID           int64
	HistogramMemUsage int64
	CMSketchMemUsage  int64
	TopNMemUsage      int64
	TotalMemUsage     int64
}
// TotalMemoryUsage implements CacheItemMemoryUsage.
func (c *IndexMemUsage) TotalMemoryUsage() int64 {
	return c.TotalMemUsage
}
// ItemID implements CacheItemMemoryUsage.
func (c *IndexMemUsage) ItemID() int64 {
	return c.IndexID
}
// TrackingMemUsage implements CacheItemMemoryUsage.
func (c *IndexMemUsage) TrackingMemUsage() int64 {
	return c.CMSketchMemUsage + c.TopNMemUsage + c.HistogramMemUsage
}
// HistMemUsage implements CacheItemMemoryUsage.
func (c *IndexMemUsage) HistMemUsage() int64 {
	return c.HistogramMemUsage
}
// TopnMemUsage implements CacheItemMemoryUsage.
func (c *IndexMemUsage) TopnMemUsage() int64 {
	return c.TopNMemUsage
}
// CMSMemUsage implements CacheItemMemoryUsage.
func (c *IndexMemUsage) CMSMemUsage() int64 {
	return c.CMSketchMemUsage
}
// MemoryUsage returns the total memory usage of this Table.
// it will only calc the size of Columns and Indices stats data of table.
// We ignore the size of other metadata in Table.
func (t *Table) MemoryUsage() *TableMemoryUsage {
	result := &TableMemoryUsage{
		TableID:         t.PhysicalID,
		ColumnsMemUsage: make(map[int64]CacheItemMemoryUsage),
		IndicesMemUsage: make(map[int64]CacheItemMemoryUsage),
	}
	for _, col := range t.columns {
		if col == nil {
			continue
		}
		usage := col.MemoryUsage()
		result.ColumnsMemUsage[usage.ItemID()] = usage
		result.TotalMemUsage += usage.TotalMemoryUsage()
	}
	for _, idx := range t.indices {
		if idx == nil {
			continue
		}
		usage := idx.MemoryUsage()
		result.IndicesMemUsage[usage.ItemID()] = usage
		result.TotalMemUsage += usage.TotalMemoryUsage()
	}
	return result
}
// CopyAs creates a copy of the table with the specified writability intent.
//
// PERFORMANCE NOTE: Choose the most minimal intent for your use case. Copying is heavily
// used at scale and unnecessary cloning causes significant memory pressure. Only use
// AllDataWritable when you truly need to modify histogram data.
//
// MetaOnly: Shares all maps, only metadata modifications are safe
// ColumnMapWritable: Clones columns map, safe to add/remove columns
// IndexMapWritable: Clones indices map, safe to add/remove indices
// BothMapsWritable: Clones both maps - safe to add/remove columns and indices
// ExtendedStatsWritable: Shares all maps, safe to modify ExtendedStats field
// AllDataWritable: Deep copies everything, safe to modify all data including histograms
func (t *Table) CopyAs(intent CopyIntent) *Table {
	var columns map[int64]*Column
	var indices map[int64]*Index
	var existenceMap *ColAndIdxExistenceMap
	switch intent {
	case MetaOnly:
		// Share everything; the copy's maps must not be mutated.
		columns = t.columns
		indices = t.indices
		existenceMap = t.ColAndIdxExistenceMap
	case ColumnMapWritable:
		// Fresh columns map; entries still point to the shared *Column values.
		columns = maps.Clone(t.columns)
		indices = t.indices
		if t.ColAndIdxExistenceMap != nil {
			existenceMap = t.ColAndIdxExistenceMap.Clone()
		}
	case IndexMapWritable:
		// Fresh indices map; entries still point to the shared *Index values.
		columns = t.columns
		indices = maps.Clone(t.indices)
		if t.ColAndIdxExistenceMap != nil {
			existenceMap = t.ColAndIdxExistenceMap.Clone()
		}
	case BothMapsWritable:
		columns = maps.Clone(t.columns)
		indices = maps.Clone(t.indices)
		if t.ColAndIdxExistenceMap != nil {
			existenceMap = t.ColAndIdxExistenceMap.Clone()
		}
	case ExtendedStatsWritable:
		columns = t.columns
		indices = t.indices
		existenceMap = t.ColAndIdxExistenceMap
	case AllDataWritable:
		// For deep copy, create new maps and deep copy all content
		columns = make(map[int64]*Column, len(t.columns))
		for id, col := range t.columns {
			columns[id] = col.Copy()
		}
		indices = make(map[int64]*Index, len(t.indices))
		for id, idx := range t.indices {
			indices[id] = idx.Copy()
		}
		if t.ColAndIdxExistenceMap != nil {
			existenceMap = t.ColAndIdxExistenceMap.Clone()
		}
	}
	newHistColl := HistColl{
		PhysicalID:    t.PhysicalID,
		RealtimeCount: t.RealtimeCount,
		columns:       columns,
		indices:       indices,
		Pseudo:        t.Pseudo,
		ModifyCount:   t.ModifyCount,
		StatsVer:      t.StatsVer,
	}
	nt := &Table{
		HistColl:              newHistColl,
		Version:               t.Version,
		TblInfoUpdateTS:       t.TblInfoUpdateTS,
		ColAndIdxExistenceMap: existenceMap,
		LastAnalyzeVersion:    t.LastAnalyzeVersion,
		LastStatsHistVersion:  t.LastStatsHistVersion,
	}
	// Handle ExtendedStats for deep copy vs shallow copy.
	// Note: even in the "deep" branch the map itself is fresh but the
	// *ExtendedStatsItem values are shared pointers (maps.Copy copies values,
	// which here are pointers).
	if (intent == AllDataWritable || intent == ExtendedStatsWritable) && t.ExtendedStats != nil {
		newExtStatsColl := &ExtendedStatsColl{
			Stats:             make(map[string]*ExtendedStatsItem),
			LastUpdateVersion: t.ExtendedStats.LastUpdateVersion,
		}
		maps.Copy(newExtStatsColl.Stats, t.ExtendedStats.Stats)
		nt.ExtendedStats = newExtStatsColl
	} else {
		nt.ExtendedStats = t.ExtendedStats
	}
	return nt
}
// String implements Stringer interface.
// The header line is followed by column stats then index stats, each sorted by ID.
func (t *Table) String() string {
	lines := make([]string, 0, len(t.columns)+1)
	lines = append(lines, fmt.Sprintf("Table:%d RealtimeCount:%d", t.PhysicalID, t.RealtimeCount))
	for _, col := range t.StableOrderColSlice() {
		lines = append(lines, col.String())
	}
	for _, idx := range t.StableOrderIdxSlice() {
		lines = append(lines, idx.String())
	}
	// TODO: concat content of ExtendedStatsColl
	return strings.Join(lines, "\n")
}
// IndexStartWithColumn finds the first index whose first column is the given column.
// Map iteration order is unspecified, so "first" is arbitrary when several match.
func (t *Table) IndexStartWithColumn(colName string) *Index {
	for _, idx := range t.indices {
		leading := idx.Info.Columns[0]
		if leading.Name.L == colName {
			return idx
		}
	}
	return nil
}
// ColumnByName finds the statistics.Column for the given column.
// Returns nil when no column matches.
func (t *Table) ColumnByName(colName string) *Column {
	for _, col := range t.columns {
		if col.Info.Name.L == colName {
			return col
		}
	}
	return nil
}
// GetStatsInfo returns their statistics according to the ID of the column or index, including histogram, CMSketch, TopN and FMSketch.
//
// needCopy: In order to protect the item in the cache from being damaged, we need to copy the item.
func (t *Table) GetStatsInfo(id int64, isIndex bool, needCopy bool) (*Histogram, *CMSketch, *TopN, *FMSketch, bool) {
	if isIndex {
		idxStats, found := t.indices[id]
		if !found {
			// newly added index which is not analyzed yet
			return nil, nil, nil, nil, false
		}
		if !needCopy {
			return &idxStats.Histogram, idxStats.CMSketch, idxStats.TopN, idxStats.FMSketch, true
		}
		return idxStats.Histogram.Copy(), idxStats.CMSketch.Copy(), idxStats.TopN.Copy(), idxStats.FMSketch.Copy(), true
	}
	colStats, found := t.columns[id]
	if !found {
		// newly added column which is not analyzed yet
		return nil, nil, nil, nil, false
	}
	if !needCopy {
		return &colStats.Histogram, colStats.CMSketch, colStats.TopN, colStats.FMSketch, true
	}
	return colStats.Histogram.Copy(), colStats.CMSketch.Copy(), colStats.TopN.Copy(), colStats.FMSketch.Copy(), true
}
// IsAnalyzed checks whether the table is analyzed or not by checking its last analyze's timestamp value.
// A valid timestamp must be greater than 0.
func (t *Table) IsAnalyzed() bool {
	return t.LastAnalyzeVersion > 0
}
// IsEligibleForAnalysis checks whether the table is eligible for analysis.
// A table is eligible only when:
// 1. Its stats are not pseudo (pseudo statistics can be created by the optimizer,
// so we need to double check it).
// 2. It is large enough to be worth analyzing; small tables yield their slot to
// bigger tables.
func (t *Table) IsEligibleForAnalysis() bool {
	return t.MeetAutoAnalyzeMinCnt() && !t.Pseudo
}
// MeetAutoAnalyzeMinCnt checks whether the table meets the minimum count required for auto-analyze.
// A nil receiver is treated as not meeting the threshold.
func (t *Table) MeetAutoAnalyzeMinCnt() bool {
	if t == nil {
		return false
	}
	return t.RealtimeCount >= AutoAnalyzeMinCnt
}
// GetAnalyzeRowCount tries to get the row count of a column or an index if possible.
// This method is useful because this row count doesn't consider the modify count.
// Columns are preferred over indexes; within each group the smallest ID wins.
// MV indexes are skipped because their total row count differs from the table's.
// Returns -1 when no fully loaded column or usable index is found.
func (coll *HistColl) GetAnalyzeRowCount() float64 {
	ids := slices.Collect(maps.Keys(coll.columns))
	slices.Sort(ids)
	for _, id := range ids {
		col := coll.columns[id]
		if col != nil && col.IsFullLoad() {
			return col.TotalRowCount()
		}
	}
	// Reuse the backing array for the index IDs. The previous code used
	// clear(ids), which zeroes the elements but keeps the length, so the index
	// IDs were appended after len(coll.columns) zero entries and every zero
	// entry caused a useless map lookup below. Truncating is the intended reset.
	ids = ids[:0]
	ids = slices.Grow(ids, len(coll.indices))
	ids = slices.AppendSeq(ids, maps.Keys(coll.indices))
	slices.Sort(ids)
	for _, id := range ids {
		idx := coll.indices[id]
		if idx == nil {
			continue
		}
		// An MV index's total row count is not the table row count; skip it.
		if idx.Info != nil && idx.Info.MVIndex {
			continue
		}
		if idx.IsFullLoad() {
			return idx.TotalRowCount()
		}
	}
	return -1
}
// GetScaledRealtimeAndModifyCnt scale the RealtimeCount and ModifyCount for some special indexes where the total row
// count is different from the total row count of the table. Currently, only the mv index is this case.
// Because we will use the RealtimeCount and ModifyCount during the estimation for ranges on this index (like the upper
// bound for the out-of-range estimation logic and the IncreaseFactor logic), we can't directly use the RealtimeCount and
// ModifyCount of the table. Instead, we should scale them before using.
// For example, if the table analyze row count is 1000 and realtime row count is 1500, and the mv index total count is 5000,
// when calculating the IncreaseFactor, it should be 1500/1000 = 1.5 for normal columns/indexes, and we should use the
// same 1.5 for mv index. But obviously, use 1500/5000 would be wrong, the correct calculation should be 7500/5000 = 1.5.
// So we add this function to get this 7500.
func (coll *HistColl) GetScaledRealtimeAndModifyCnt(idxStats *Index) (realtimeCnt, modifyCnt int64) {
	// In theory, we can apply this scale logic on all indexes. But currently, we only apply it on the mv index to avoid
	// any unexpected changes caused by factors like precision difference.
	isScalableMVIndex := idxStats != nil && idxStats.Info != nil && idxStats.Info.MVIndex && idxStats.IsFullLoad()
	if !isScalableMVIndex {
		return coll.RealtimeCount, coll.ModifyCount
	}
	analyzeRowCount := coll.GetAnalyzeRowCount()
	if analyzeRowCount <= 0 {
		return coll.RealtimeCount, coll.ModifyCount
	}
	idxTotalRowCount := idxStats.TotalRowCount()
	if idxTotalRowCount <= 0 {
		return coll.RealtimeCount, coll.ModifyCount
	}
	ratio := idxTotalRowCount / analyzeRowCount
	realtimeCnt = int64(float64(coll.RealtimeCount) * ratio)
	modifyCnt = int64(float64(coll.ModifyCount) * ratio)
	return realtimeCnt, modifyCnt
}
// GetStatsHealthy calculates stats healthy if the table stats is not pseudo.
// If the table stats is pseudo, it returns 0, false, otherwise it returns stats healthy, true.
// Healthy is a percentage in [0, 100]: 100 means no modifications since analyze,
// 0 means the modify count has reached (or exceeded) the baseline row count.
func (t *Table) GetStatsHealthy() (int64, bool) {
	if t == nil || t.Pseudo {
		return 0, false
	}
	if !t.IsAnalyzed() {
		return 0, true
	}
	var healthy int64
	// Prefer the analyzed row count as the baseline; fall back to the realtime
	// count when GetAnalyzeRowCount finds nothing usable (it returns -1).
	count := float64(t.RealtimeCount)
	if histCount := t.GetAnalyzeRowCount(); histCount > 0 {
		count = histCount
	}
	if float64(t.ModifyCount) < count {
		healthy = int64((1.0 - float64(t.ModifyCount)/count) * 100.0)
	} else if t.ModifyCount == 0 {
		// Reached only when count <= 0 and nothing was modified.
		healthy = 100
	}
	return healthy, true
}
// ColumnIsLoadNeeded checks whether the column needs trigger the async/sync load.
// The Column should be visible in the table and really has analyzed statistics in the storage.
// Also, if the stats has been loaded into the memory, we also don't need to load it.
// We return the Column together with the checking result, to avoid accessing the map multiple times.
// The first bool is whether we need to load it into memory. The second bool is whether this column has stats in the system table or not.
func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (col *Column, loadNeeded, hasAnalyzed bool) {
	if t.Pseudo {
		return nil, false, false
	}
	hasAnalyzed = t.ColAndIdxExistenceMap.HasAnalyzed(id, false)
	col, ok := t.columns[id]
	if !ok {
		// If the column has no stats object in memory, we need to check it by existence map.
		// If existence map says it even has no uninitialized record in storage, we don't need to do anything. => Has=false, HasAnalyzed=false
		// If existence map says it has analyzed stats, we need to load it from storage. => Has=true, HasAnalyzed=true
		// If existence map says it has no analyzed stats but has an uninitialized record in storage, we need to also create a fake object. => Has=true, HasAnalyzed=false
		return nil, t.ColAndIdxExistenceMap.Has(id, false), hasAnalyzed
	}
	// If it's not analyzed yet.
	// The real check condition: ok && !hasAnalyzed. (Has must be true since we have the memory object, so we should have the storage object.)
	// After this check, we will always have ok && hasAnalyzed.
	if !hasAnalyzed {
		return nil, false, false
	}
	// Restore the condition from the simplified form:
	// 1. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
	// 2. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
	if (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
		return col, true, true
	}
	// Otherwise don't need load it.
	return col, false, true
}
// IndexIsLoadNeeded checks whether the index needs trigger the async/sync load.
// The Index should be visible in the table and really has analyzed statistics in the storage.
// Also, if the stats has been loaded into the memory, we also don't need to load it.
// We return the Index together with the checking result, to avoid accessing the map multiple times.
func (t *Table) IndexIsLoadNeeded(id int64) (*Index, bool) {
	idx, found := t.indices[id]
	if !found {
		// Not in memory: trigger a load only when storage has analyzed stats for it.
		if t.ColAndIdxExistenceMap.HasAnalyzed(id, true) {
			return nil, true
		}
		return nil, false
	}
	// In memory: trigger a load when it's analyzed but not fully loaded yet.
	if idx.IsAnalyzed() && !idx.IsFullLoad() {
		return idx, true
	}
	return idx, false
}
// RatioOfPseudoEstimate means if modifyCount / statsTblCount is greater than this ratio, we think the stats is invalid
// and use pseudo estimation.
var RatioOfPseudoEstimate = atomic.NewFloat64(0.7)
// IsInitialized returns true if any column/index stats of the table is initialized.
func (t *Table) IsInitialized() bool {
	for _, colStats := range t.columns {
		if colStats == nil || !colStats.IsStatsInitialized() {
			continue
		}
		return true
	}
	for _, idxStats := range t.indices {
		if idxStats == nil || !idxStats.IsStatsInitialized() {
			continue
		}
		return true
	}
	return false
}
// IsOutdated returns true if the table stats is outdated.
func (t *Table) IsOutdated() bool {
	// Prefer the row count recorded at analyze time; fall back to the
	// realtime count when the analyze count is unavailable (negative).
	total := t.GetAnalyzeRowCount()
	if total < 0 {
		total = float64(t.RealtimeCount)
	}
	return total > 0 && float64(t.ModifyCount)/total > RatioOfPseudoEstimate.Load()
}
// ReleaseAndPutToPool releases data structures of Table and put itself back to pool.
func (t *Table) ReleaseAndPutToPool() {
	for _, col := range t.columns {
		// Guard against nil map entries: other methods on Table (e.g. the
		// initialized-stats check) treat nil entries as possible, and
		// dereferencing col.FMSketch on a nil *Column would panic.
		if col != nil {
			col.FMSketch.DestroyAndPutToPool()
		}
	}
	clear(t.columns)
	for _, idx := range t.indices {
		if idx != nil {
			idx.FMSketch.DestroyAndPutToPool()
		}
	}
	clear(t.indices)
}
// ID2UniqueID generates a new HistColl whose `Columns` is built from UniqueID of given columns.
func (coll *HistColl) ID2UniqueID(columns []*expression.Column) *HistColl {
	// Pre-size to the known upper bound: at most one entry per given column.
	cols := make(map[int64]*Column, len(columns))
	for _, col := range columns {
		if colHist, ok := coll.columns[col.ID]; ok {
			// Re-key the histogram by the column's UniqueID.
			cols[col.UniqueID] = colHist
		}
	}
	return &HistColl{
		PhysicalID:    coll.PhysicalID,
		Pseudo:        coll.Pseudo,
		RealtimeCount: coll.RealtimeCount,
		ModifyCount:   coll.ModifyCount,
		columns:       cols,
	}
}
// GenerateHistCollFromColumnInfo generates a new HistColl whose ColUniqueID2IdxIDs and Idx2ColUniqueIDs is built from the given parameter.
// Column histograms are re-keyed from column info ID to expression UniqueID, and
// index histograms are kept only when at least a prefix of the index columns is
// present in the given column list.
func (coll *HistColl) GenerateHistCollFromColumnInfo(tblInfo *model.TableInfo, columns []*expression.Column) *HistColl {
	newColHistMap := make(map[int64]*Column)
	// Mappings between column-info IDs, expression columns, and UniqueIDs.
	colInfoID2Col := make(map[int64]*expression.Column, len(columns))
	colInfoID2UniqueID := make(map[int64]int64, len(columns))
	uniqueID2colInfoID := make(map[int64]int64, len(columns))
	idxID2idxInfo := make(map[int64]*model.IndexInfo)
	for _, col := range columns {
		colInfoID2Col[col.ID] = col
		colInfoID2UniqueID[col.ID] = col.UniqueID
		uniqueID2colInfoID[col.UniqueID] = col.ID
	}
	for id, colHist := range coll.columns {
		uniqueID, ok := colInfoID2UniqueID[id]
		// Collect the statistics by the given columns.
		if ok {
			newColHistMap[uniqueID] = colHist
		}
	}
	for _, idxInfo := range tblInfo.Indices {
		idxID2idxInfo[idxInfo.ID] = idxInfo
	}
	newIdxHistMap := make(map[int64]*Index)
	idx2Columns := make(map[int64][]int64)
	colID2IdxIDs := make(map[int64][]int64)
	mvIdx2Columns := make(map[int64][]*expression.Column)
	for id, idxHist := range coll.indices {
		idxInfo := idxID2idxInfo[id]
		if idxInfo == nil {
			// The index histogram has no matching index info; skip it.
			continue
		}
		ids := make([]int64, 0, len(idxInfo.Columns))
		for _, idxCol := range idxInfo.Columns {
			uniqueID, ok := colInfoID2UniqueID[tblInfo.Columns[idxCol.Offset].ID]
			if !ok {
				// Stop at the first index column missing from `columns`:
				// only the leading prefix of the index is usable.
				break
			}
			ids = append(ids, uniqueID)
		}
		// If the length of the id list is 0, this index won't be used in this query.
		if len(ids) == 0 {
			continue
		}
		// Register the index under its first (leading) column's UniqueID.
		colID2IdxIDs[ids[0]] = append(colID2IdxIDs[ids[0]], idxHist.ID)
		newIdxHistMap[idxHist.ID] = idxHist
		idx2Columns[idxHist.ID] = ids
		if idxInfo.MVIndex {
			// Resolve the columns of the multi-valued index via the injected
			// planner hook (PrepareCols4MVIndex, see its declaration below).
			cols, ok := PrepareCols4MVIndex(tblInfo, idxInfo, colInfoID2Col, true)
			if ok {
				mvIdx2Columns[id] = cols
			}
		}
	}
	// Sort for deterministic index-ID ordering per column.
	for _, idxIDs := range colID2IdxIDs {
		slices.Sort(idxIDs)
	}
	newColl := &HistColl{
		PhysicalID:         coll.PhysicalID,
		Pseudo:             coll.Pseudo,
		RealtimeCount:      coll.RealtimeCount,
		ModifyCount:        coll.ModifyCount,
		columns:            newColHistMap,
		indices:            newIdxHistMap,
		ColUniqueID2IdxIDs: colID2IdxIDs,
		Idx2ColUniqueIDs:   idx2Columns,
		UniqueID2colInfoID: uniqueID2colInfoID,
		MVIdx2Columns:      mvIdx2Columns,
	}
	return newColl
}
// PseudoHistColl creates a lightweight pseudo HistColl for cost calculation.
// This is optimized for cases where only HistColl is needed, avoiding the overhead
// of creating a full pseudo table with ColAndIdxExistenceMap and other structures.
func PseudoHistColl(physicalID int64, allowTriggerLoading bool) HistColl {
	coll := HistColl{
		PhysicalID:        physicalID,
		RealtimeCount:     PseudoRowCount,
		ModifyCount:       0,
		StatsVer:          0,
		Pseudo:            true,
		CanNotTriggerLoad: !allowTriggerLoading,
		// No per-column/per-index stats for a pseudo collection.
		columns: nil,
		indices: nil,
	}
	return coll
}
// PseudoTable creates a pseudo table statistics.
// Usually, we don't want to trigger stats loading for pseudo table.
// But there are exceptional cases. In such cases, we should pass allowTriggerLoading as true.
// Such case could possibly happen in getStatsTable().
func PseudoTable(tblInfo *model.TableInfo, allowTriggerLoading bool, allowFillHistMeta bool) *Table {
	tbl := &Table{
		HistColl:              PseudoHistColl(tblInfo.ID, allowTriggerLoading),
		Version:               PseudoVersion,
		ColAndIdxExistenceMap: NewColAndIndexExistenceMap(len(tblInfo.Columns), len(tblInfo.Indices)),
	}
	// The columns/indices maps are materialized only when hist meta is wanted.
	if allowFillHistMeta {
		tbl.columns = make(map[int64]*Column, len(tblInfo.Columns))
		tbl.indices = make(map[int64]*Index, len(tblInfo.Indices))
	}
	for _, col := range tblInfo.Columns {
		// Skip columns that are not public, and hidden columns: hidden means
		// the column backs an expression index; we never collect stats for it
		// nor use it for estimation, so no pseudo stats are created either.
		if col.State != model.StatePublic || col.Hidden {
			continue
		}
		tbl.ColAndIdxExistenceMap.InsertCol(col.ID, false)
		if allowFillHistMeta {
			tbl.columns[col.ID] = &Column{
				PhysicalID: tblInfo.ID,
				Info:       col,
				IsHandle:   tblInfo.PKIsHandle && mysql.HasPriKeyFlag(col.GetFlag()),
				Histogram:  *NewPseudoHistogram(col.ID, &col.FieldType),
			}
		}
	}
	for _, idx := range tblInfo.Indices {
		if idx.State != model.StatePublic {
			continue
		}
		tbl.ColAndIdxExistenceMap.InsertIndex(idx.ID, false)
		if allowFillHistMeta {
			tbl.indices[idx.ID] = &Index{
				PhysicalID: tblInfo.ID,
				Info:       idx,
				Histogram:  *NewPseudoHistogram(idx.ID, types.NewFieldType(mysql.TypeBlob)),
			}
		}
	}
	return tbl
}
// CheckAnalyzeVerOnTable checks whether the given version is the one from the tbl.
// If not, it will return false and set the version to the tbl's.
// We use this check to make sure all the statistics of the table are in the same version.
func CheckAnalyzeVerOnTable(tbl *Table, version *int) bool {
	// Unanalyzed tables, or matching versions, are fine as-is.
	if !IsAnalyzed(int64(tbl.StatsVer)) || tbl.StatsVer == *version {
		return true
	}
	// Version mismatch: report it and sync the caller's version to the table's.
	*version = tbl.StatsVer
	return false
}
// PrepareCols4MVIndex helps to identify the columns of an MV index. We need this information for estimation.
// This logic is shared between the estimation logic and the access path generation logic. We'd like to put the mv index
// related functions together in the planner/core package. So we use this trick here to avoid the import cycle.
// NOTE(review): this function variable is assigned from outside this package; callers here
// (e.g. GenerateHistCollFromColumnInfo) appear to assume it is non-nil — confirm it is
// always set before any estimation runs.
var PrepareCols4MVIndex func(
	tableInfo *model.TableInfo,
	mvIndex *model.IndexInfo,
	tblColsByID map[int64]*expression.Column,
	checkOnly1ArrayTypeCol bool,
) (idxCols []*expression.Column, ok bool)