planner, statistics: refine stats loaded status and when to use pseudo stats (#37444)

close pingcap/tidb#37485
This commit is contained in:
Yifan Xu
2022-09-01 20:56:24 +08:00
committed by GitHub
parent 835922a5cd
commit 7d0318cbd3
11 changed files with 202 additions and 57 deletions

View File

@ -183,6 +183,9 @@ func (e *ShowExec) appendTableForStatsHistograms(dbName, tblName, partitionName
col.StatsLoadedStatus.StatusToString(), col.MemoryUsage())
}
for _, idx := range stableIdxsStats(statsTbl.Indices) {
if !idx.IsStatsInitialized() {
continue
}
e.histogramToRow(dbName, tblName, partitionName, idx.Info.Name.O, 1, idx.Histogram, 0,
idx.StatsLoadedStatus.StatusToString(), idx.MemoryUsage())
}

View File

@ -4144,15 +4144,20 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64)
return statistics.PseudoTable(tblInfo)
}
// 3. statistics is outdated.
if ctx.GetSessionVars().GetEnablePseudoForOutdatedStats() {
if statsTbl.IsOutdated() {
tbl := *statsTbl
tbl.Pseudo = true
statsTbl = &tbl
// 3. statistics is uninitialized or outdated.
pseudoStatsForUninitialized := !statsTbl.IsInitialized()
pseudoStatsForOutdated := ctx.GetSessionVars().GetEnablePseudoForOutdatedStats() && statsTbl.IsOutdated()
if pseudoStatsForUninitialized || pseudoStatsForOutdated {
tbl := *statsTbl
tbl.Pseudo = true
statsTbl = &tbl
if pseudoStatsForUninitialized {
pseudoEstimationNotAvailable.Inc()
} else {
pseudoEstimationOutdate.Inc()
}
}
return statsTbl
}

View File

@ -112,14 +112,16 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache *stat
}
hist := statistics.NewHistogram(id, ndv, nullCount, version, types.NewFieldType(mysql.TypeBlob), chunk.InitialCapacity, 0)
index := &statistics.Index{
Histogram: *hist,
CMSketch: cms,
TopN: topN,
Info: idxInfo,
StatsVer: statsVer,
Flag: row.GetInt64(10),
PhysicalID: tblID,
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
Histogram: *hist,
CMSketch: cms,
TopN: topN,
Info: idxInfo,
StatsVer: statsVer,
Flag: row.GetInt64(10),
PhysicalID: tblID,
}
if statsVer != statistics.Version0 {
index.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
lastAnalyzePos.Copy(&index.LastAnalyzePos)
table.Indices[hist.ID] = index
@ -430,6 +432,18 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) {
if err != nil {
return errors.Trace(err)
}
// Set columns' stats status.
for _, table := range cache.Values() {
for _, col := range table.Columns {
if col.StatsVer != statistics.Version0 || col.Count > 0 {
if mysql.HasPriKeyFlag(col.Info.GetFlag()) {
col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
} else {
col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus()
}
}
}
}
cache.FreshMemUsage()
h.updateStatsCache(cache)
v := h.statsCache.Load()

View File

@ -118,6 +118,7 @@ func TestDDLHistogram(t *testing.T) {
tableInfo := tbl.Meta()
statsTbl := do.StatsHandle().GetTableStats(tableInfo)
require.False(t, statsTbl.Pseudo)
require.True(t, statsTbl.Columns[tableInfo.Columns[2].ID].IsStatsInitialized())
require.Equal(t, int64(2), statsTbl.Columns[tableInfo.Columns[2].ID].NullCount)
require.Equal(t, int64(0), statsTbl.Columns[tableInfo.Columns[2].ID].Histogram.NDV)
@ -131,6 +132,7 @@ func TestDDLHistogram(t *testing.T) {
tableInfo = tbl.Meta()
statsTbl = do.StatsHandle().GetTableStats(tableInfo)
require.False(t, statsTbl.Pseudo)
require.True(t, statsTbl.Columns[tableInfo.Columns[3].ID].IsStatsInitialized())
sctx := mock.NewContext()
count, err := statsTbl.ColumnEqualRowCount(sctx, types.NewIntDatum(0), tableInfo.Columns[3].ID)
require.NoError(t, err)
@ -161,6 +163,7 @@ func TestDDLHistogram(t *testing.T) {
tableInfo = tbl.Meta()
statsTbl = do.StatsHandle().GetTableStats(tableInfo)
require.False(t, statsTbl.Pseudo)
require.True(t, statsTbl.Columns[tableInfo.Columns[5].ID].IsStatsInitialized())
require.Equal(t, 3.0, statsTbl.Columns[tableInfo.Columns[5].ID].AvgColSize(statsTbl.Count, false))
testKit.MustExec("alter table t add column c6 varchar(15) DEFAULT '123', add column c7 varchar(15) DEFAULT '123'")

View File

@ -699,19 +699,24 @@ func (h *Handle) loadNeededColumnHistograms(reader *statsReader, col model.Table
logutil.BgLogger().Error("fail to get stats version for this histogram", zap.Int64("table_id", col.TableID), zap.Int64("hist_id", col.ID))
return errors.Trace(fmt.Errorf("fail to get stats version for this histogram, table_id:%v, hist_id:%v", col.TableID, col.ID))
}
statsVer := rows[0].GetInt64(0)
colHist := &statistics.Column{
PhysicalID: col.TableID,
Histogram: *hg,
Info: c.Info,
CMSketch: cms,
TopN: topN,
FMSketch: fms,
IsHandle: c.IsHandle,
StatsVer: rows[0].GetInt64(0),
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
PhysicalID: col.TableID,
Histogram: *hg,
Info: c.Info,
CMSketch: cms,
TopN: topN,
FMSketch: fms,
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we need to add colHist.Count > 0 here.
if statsVer != statistics.Version0 || colHist.Count > 0 {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
// Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions
// like `GetPartitionStats` called in `fmSketchFromStorage` would have modified the stats cache already.
oldCache = h.statsCache.Load().(statsCache)
@ -835,6 +840,7 @@ func (h *Handle) indexStatsFromStorage(reader *statsReader, row chunk.Row, table
distinct := row.GetInt64(3)
histVer := row.GetUint64(4)
nullCount := row.GetInt64(5)
statsVer := row.GetInt64(7)
idx := table.Indices[histID]
errorRate := statistics.ErrorRate{}
flag := row.GetInt64(8)
@ -861,10 +867,20 @@ func (h *Handle) indexStatsFromStorage(reader *statsReader, row chunk.Row, table
if err != nil {
return errors.Trace(err)
}
idx = &statistics.Index{Histogram: *hg, CMSketch: cms, TopN: topN, FMSketch: fmSketch,
Info: idxInfo, ErrorRate: errorRate, StatsVer: row.GetInt64(7), Flag: flag,
PhysicalID: table.PhysicalID,
StatsLoadedStatus: statistics.NewStatsFullLoadStatus()}
idx = &statistics.Index{
Histogram: *hg,
CMSketch: cms,
TopN: topN,
FMSketch: fmSketch,
Info: idxInfo,
ErrorRate: errorRate,
StatsVer: statsVer,
Flag: flag,
PhysicalID: table.PhysicalID,
}
if statsVer != statistics.Version0 {
idx.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
lastAnalyzePos.Copy(&idx.LastAnalyzePos)
}
break
@ -923,6 +939,11 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
Flag: flag,
StatsVer: statsVer,
}
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we need to add col.Count > 0 here.
if statsVer != statistics.Version0 || col.Count > 0 {
col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus()
}
lastAnalyzePos.Copy(&col.LastAnalyzePos)
col.Histogram.Correlation = correlation
break
@ -946,20 +967,24 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl
}
}
col = &statistics.Column{
PhysicalID: table.PhysicalID,
Histogram: *hg,
Info: colInfo,
CMSketch: cms,
TopN: topN,
FMSketch: fmSketch,
ErrorRate: errorRate,
IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
Flag: flag,
StatsVer: statsVer,
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
PhysicalID: table.PhysicalID,
Histogram: *hg,
Info: colInfo,
CMSketch: cms,
TopN: topN,
FMSketch: fmSketch,
ErrorRate: errorRate,
IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
Flag: flag,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing col.
col.Count = int64(col.TotalRowCount())
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we need to add col.Count > 0 here.
if statsVer != statistics.Version0 || col.Count > 0 {
col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
lastAnalyzePos.Copy(&col.LastAnalyzePos)
break
}

View File

@ -345,34 +345,42 @@ func (h *Handle) readStatsForOneItem(item model.TableItemID, w *statsWrapper, re
zap.Int64("hist_id", item.ID), zap.Bool("is_index", item.IsIndex))
return nil, errors.Trace(fmt.Errorf("fail to get stats version for this histogram, table_id:%v, hist_id:%v, is_index:%v", item.TableID, item.ID, item.IsIndex))
}
statsVer := rows[0].GetInt64(0)
if item.IsIndex {
idxHist := &statistics.Index{
Histogram: *hg,
CMSketch: cms,
TopN: topN,
FMSketch: fms,
Info: index.Info,
ErrorRate: index.ErrorRate,
StatsVer: rows[0].GetInt64(0), Flag: index.Flag,
PhysicalID: index.PhysicalID,
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
Histogram: *hg,
CMSketch: cms,
TopN: topN,
FMSketch: fms,
Info: index.Info,
ErrorRate: index.ErrorRate,
StatsVer: statsVer,
Flag: index.Flag,
PhysicalID: index.PhysicalID,
}
if statsVer != statistics.Version0 {
idxHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
index.LastAnalyzePos.Copy(&idxHist.LastAnalyzePos)
w.idx = idxHist
} else {
colHist := &statistics.Column{
PhysicalID: item.TableID,
Histogram: *hg,
Info: c.Info,
CMSketch: cms,
TopN: topN,
FMSketch: fms,
IsHandle: c.IsHandle,
StatsVer: rows[0].GetInt64(0),
StatsLoadedStatus: statistics.NewStatsFullLoadStatus(),
PhysicalID: item.TableID,
Histogram: *hg,
Info: c.Info,
CMSketch: cms,
TopN: topN,
FMSketch: fms,
IsHandle: c.IsHandle,
StatsVer: statsVer,
}
// Column.Count is calculated by Column.TotalRowCount(). Hence, we don't set Column.Count when initializing colHist.
colHist.Count = int64(colHist.TotalRowCount())
// When adding/modifying a column, we create its stats (all values are default values) without setting stats_ver.
// So we need to add colHist.Count > 0 here.
if statsVer != statistics.Version0 || colHist.Count > 0 {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
}
w.col = colHist
}
return w, nil

View File

@ -3357,3 +3357,38 @@ func TestAnalyzeTableLRUPut(t *testing.T) {
tk.MustExec("analyze table test.t")
require.Equal(t, tbl.Meta().ID, domain.GetDomain(tk.Session()).StatsHandle().GetStatsCacheFrontTable())
}
// TestUninitializedStatsStatus covers a table that received rows but was never analyzed.
// It asserts that every column and index in the cached table stats reports an uninitialized
// status, that `show stats_histograms` returns no rows for the table, and that the plan for a
// full scan is marked "stats:pseudo" whether tidb_enable_pseudo_for_outdated_stats is on or off.
func TestUninitializedStatsStatus(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	dom.StatsHandle().SetLease(0)
	tk := testkit.NewTestKit(t, store)
	tk.MustExec("use test")
	tk.MustExec("drop table if exists t")
	tk.MustExec("create table t(a int, b int, c int, index idx_a(a))")
	statsHandle := dom.StatsHandle()
	// Consume the DDL event emitted by the CREATE TABLE above.
	require.NoError(t, statsHandle.HandleDDLEvent(<-statsHandle.DDLEventCh()))
	tk.MustExec("insert into t values (1,2,2), (3,4,4), (5,6,6), (7,8,8), (9,10,10)")
	require.NoError(t, statsHandle.DumpStatsDeltaToKV(handle.DumpAll))
	schema := dom.InfoSchema()
	require.NoError(t, statsHandle.Update(schema))
	table, err := schema.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	require.NoError(t, err)
	stats := statsHandle.GetTableStats(table.Meta())

	// No ANALYZE has run, so nothing may report initialized stats.
	for _, colStats := range stats.Columns {
		require.False(t, colStats.IsStatsInitialized())
	}
	for _, idxStats := range stats.Indices {
		require.False(t, idxStats.IsStatsInitialized())
	}
	tk.MustQuery("show stats_histograms where db_name = 'test' and table_name = 't'").Check(testkit.Rows())

	// The plan must use pseudo stats under both settings of the session variable.
	for _, setStmt := range []string{
		"set @@tidb_enable_pseudo_for_outdated_stats = true",
		"set @@tidb_enable_pseudo_for_outdated_stats = false",
	} {
		tk.MustExec(setStmt)
		planRows := tk.MustQuery("explain select * from t").Rows()
		lastRow := planRows[len(planRows)-1]
		// Column 4 of the last explain row is the text checked for the pseudo marker.
		require.True(t, strings.Contains(lastRow[4].(string), "stats:pseudo"))
	}
}

View File

@ -1910,7 +1910,9 @@ func TestLoadHistCorrelation(t *testing.T) {
h.Clear()
require.NoError(t, h.Update(dom.InfoSchema()))
result := testKit.MustQuery("show stats_histograms where Table_name = 't'")
require.Len(t, result.Rows(), 0)
// After https://github.com/pingcap/tidb/pull/37444, `show stats_histograms` displays the columns whose hist/topn/cmsketch
// are not loaded and their stats status is allEvicted.
require.Len(t, result.Rows(), 1)
testKit.MustExec("explain select * from t where c = 1")
require.NoError(t, h.LoadNeededHistograms())
result = testKit.MustQuery("show stats_histograms where Table_name = 't'")

View File

@ -1570,6 +1570,16 @@ func NewStatsFullLoadStatus() StatsLoadedStatus {
}
}
// NewStatsAllEvictedStatus returns a status that is marked as initialized with the allEvicted
// eviction state: only count/nullCount/NDV are loaded, while CMSketch/TopN/Histogram are not.
// Column stats are in this state by default when table stats are loaded; the CMSketch/TopN/Histogram
// of a column are only loaded when the column stats are really needed.
func NewStatsAllEvictedStatus() StatsLoadedStatus {
	var status StatsLoadedStatus
	status.statsInitialized = true
	status.evictedStatus = allEvicted
	return status
}
// IsStatsInitialized indicates whether the column/index's statistics were loaded from storage before.
// Note that `IsStatsInitialized` can only be set during initialization.
func (s StatsLoadedStatus) IsStatsInitialized() bool {

View File

@ -20,6 +20,7 @@ import (
"strconv"
"strings"
"testing"
"time"
"github.com/pingcap/failpoint"
"github.com/pingcap/tidb/parser/model"
@ -629,3 +630,27 @@ func TestCrossValidationSelectivity(t *testing.T) {
"└─Selection 0.00 cop[tikv] gt(test.t.c, 1000)",
" └─TableRangeScan 2.00 cop[tikv] table:t range:(1 0,1 1000), keep order:false"))
}
// TestShowHistogramsLoadStatus checks the load status reported by `show stats_histograms` after a
// table has been analyzed and the stats cache refreshed: the rows named "a" (the primary key
// column) and "idx" (the index) must be "allLoaded", and every other row must be "allEvicted".
func TestShowHistogramsLoadStatus(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	h := dom.StatsHandle()
	// Run with a one-second lease and restore the previous lease when the test finishes.
	origLease := h.Lease()
	h.SetLease(time.Second)
	defer func() { h.SetLease(origLease) }()
	tk.MustExec("use test")
	tk.MustExec("create table t(a int primary key, b int, c int, index idx(b, c))")
	// Consume the DDL event emitted by the CREATE TABLE above.
	require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh()))
	tk.MustExec("insert into t values (1,2,3), (4,5,6)")
	require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll))
	tk.MustExec("analyze table t")
	require.NoError(t, h.Update(dom.InfoSchema()))
	rows := tk.MustQuery("show stats_histograms where db_name = 'test' and table_name = 't'").Rows()
	// Guard against a vacuous pass: with no rows, the loop below would assert nothing.
	require.NotEmpty(t, rows)
	for _, row := range rows {
		// row[3] is compared against the column/index name and row[10] against the load status.
		if row[3] == "a" || row[3] == "idx" {
			require.Equal(t, "allLoaded", row[10].(string))
		} else {
			require.Equal(t, "allEvicted", row[10].(string))
		}
	}
}

View File

@ -463,6 +463,21 @@ func (n *neededStatsMap) Length() int {
// and use pseudo estimation.
var RatioOfPseudoEstimate = atomic.NewFloat64(0.7)
// IsInitialized reports whether at least one column or index of the table has initialized stats.
// Nil entries in Columns and Indices are skipped.
func (t *Table) IsInitialized() bool {
	for _, colStats := range t.Columns {
		if colStats == nil || !colStats.IsStatsInitialized() {
			continue
		}
		return true
	}
	for _, idxStats := range t.Indices {
		if idxStats == nil || !idxStats.IsStatsInitialized() {
			continue
		}
		return true
	}
	return false
}
// IsOutdated returns true if the table stats is outdated.
func (t *Table) IsOutdated() bool {
rowcount := t.GetColRowCount()