From 7d0318cbd33f45e602e232049abacdffdaffbb5d Mon Sep 17 00:00:00 2001 From: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com> Date: Thu, 1 Sep 2022 20:56:24 +0800 Subject: [PATCH] planner, statistics: refine stats loaded status and when to use pseudo stats (#37444) close pingcap/tidb#37485 --- executor/show_stats.go | 3 ++ planner/core/logical_plan_builder.go | 17 ++++--- statistics/handle/bootstrap.go | 30 +++++++++--- statistics/handle/ddl_test.go | 3 ++ statistics/handle/handle.go | 73 +++++++++++++++++++--------- statistics/handle/handle_hist.go | 44 ++++++++++------- statistics/handle/handle_test.go | 35 +++++++++++++ statistics/handle/update_test.go | 4 +- statistics/histogram.go | 10 ++++ statistics/integration_test.go | 25 ++++++++++ statistics/table.go | 15 ++++++ 11 files changed, 202 insertions(+), 57 deletions(-) diff --git a/executor/show_stats.go b/executor/show_stats.go index 5f0adb235d..597c6923cc 100644 --- a/executor/show_stats.go +++ b/executor/show_stats.go @@ -183,6 +183,9 @@ func (e *ShowExec) appendTableForStatsHistograms(dbName, tblName, partitionName col.StatsLoadedStatus.StatusToString(), col.MemoryUsage()) } for _, idx := range stableIdxsStats(statsTbl.Indices) { + if !idx.IsStatsInitialized() { + continue + } e.histogramToRow(dbName, tblName, partitionName, idx.Info.Name.O, 1, idx.Histogram, 0, idx.StatsLoadedStatus.StatusToString(), idx.MemoryUsage()) } diff --git a/planner/core/logical_plan_builder.go b/planner/core/logical_plan_builder.go index 51358486b8..13f55806d2 100644 --- a/planner/core/logical_plan_builder.go +++ b/planner/core/logical_plan_builder.go @@ -4144,15 +4144,20 @@ func getStatsTable(ctx sessionctx.Context, tblInfo *model.TableInfo, pid int64) return statistics.PseudoTable(tblInfo) } - // 3. statistics is outdated. - if ctx.GetSessionVars().GetEnablePseudoForOutdatedStats() { - if statsTbl.IsOutdated() { - tbl := *statsTbl - tbl.Pseudo = true - statsTbl = &tbl + // 3. 
statistics is uninitialized or outdated. + pseudoStatsForUninitialized := !statsTbl.IsInitialized() + pseudoStatsForOutdated := ctx.GetSessionVars().GetEnablePseudoForOutdatedStats() && statsTbl.IsOutdated() + if pseudoStatsForUninitialized || pseudoStatsForOutdated { + tbl := *statsTbl + tbl.Pseudo = true + statsTbl = &tbl + if pseudoStatsForUninitialized { + pseudoEstimationNotAvailable.Inc() + } else { pseudoEstimationOutdate.Inc() } } + return statsTbl } diff --git a/statistics/handle/bootstrap.go b/statistics/handle/bootstrap.go index 094a02bd1d..4aaeb05cf8 100644 --- a/statistics/handle/bootstrap.go +++ b/statistics/handle/bootstrap.go @@ -112,14 +112,16 @@ func (h *Handle) initStatsHistograms4Chunk(is infoschema.InfoSchema, cache *stat } hist := statistics.NewHistogram(id, ndv, nullCount, version, types.NewFieldType(mysql.TypeBlob), chunk.InitialCapacity, 0) index := &statistics.Index{ - Histogram: *hist, - CMSketch: cms, - TopN: topN, - Info: idxInfo, - StatsVer: statsVer, - Flag: row.GetInt64(10), - PhysicalID: tblID, - StatsLoadedStatus: statistics.NewStatsFullLoadStatus(), + Histogram: *hist, + CMSketch: cms, + TopN: topN, + Info: idxInfo, + StatsVer: statsVer, + Flag: row.GetInt64(10), + PhysicalID: tblID, + } + if statsVer != statistics.Version0 { + index.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() } lastAnalyzePos.Copy(&index.LastAnalyzePos) table.Indices[hist.ID] = index @@ -430,6 +432,18 @@ func (h *Handle) InitStats(is infoschema.InfoSchema) (err error) { if err != nil { return errors.Trace(err) } + // Set columns' stats status. 
+ for _, table := range cache.Values() { + for _, col := range table.Columns { + if col.StatsVer != statistics.Version0 || col.Count > 0 { + if mysql.HasPriKeyFlag(col.Info.GetFlag()) { + col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() + } else { + col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus() + } + } + } + } cache.FreshMemUsage() h.updateStatsCache(cache) v := h.statsCache.Load() diff --git a/statistics/handle/ddl_test.go b/statistics/handle/ddl_test.go index a075720b84..a93f0f00f7 100644 --- a/statistics/handle/ddl_test.go +++ b/statistics/handle/ddl_test.go @@ -118,6 +118,7 @@ func TestDDLHistogram(t *testing.T) { tableInfo := tbl.Meta() statsTbl := do.StatsHandle().GetTableStats(tableInfo) require.False(t, statsTbl.Pseudo) + require.True(t, statsTbl.Columns[tableInfo.Columns[2].ID].IsStatsInitialized()) require.Equal(t, int64(2), statsTbl.Columns[tableInfo.Columns[2].ID].NullCount) require.Equal(t, int64(0), statsTbl.Columns[tableInfo.Columns[2].ID].Histogram.NDV) @@ -131,6 +132,7 @@ func TestDDLHistogram(t *testing.T) { tableInfo = tbl.Meta() statsTbl = do.StatsHandle().GetTableStats(tableInfo) require.False(t, statsTbl.Pseudo) + require.True(t, statsTbl.Columns[tableInfo.Columns[3].ID].IsStatsInitialized()) sctx := mock.NewContext() count, err := statsTbl.ColumnEqualRowCount(sctx, types.NewIntDatum(0), tableInfo.Columns[3].ID) require.NoError(t, err) @@ -161,6 +163,7 @@ func TestDDLHistogram(t *testing.T) { tableInfo = tbl.Meta() statsTbl = do.StatsHandle().GetTableStats(tableInfo) require.False(t, statsTbl.Pseudo) + require.True(t, statsTbl.Columns[tableInfo.Columns[5].ID].IsStatsInitialized()) require.Equal(t, 3.0, statsTbl.Columns[tableInfo.Columns[5].ID].AvgColSize(statsTbl.Count, false)) testKit.MustExec("alter table t add column c6 varchar(15) DEFAULT '123', add column c7 varchar(15) DEFAULT '123'") diff --git a/statistics/handle/handle.go b/statistics/handle/handle.go index 16ab7a589a..b285033b7f 100644 --- 
a/statistics/handle/handle.go +++ b/statistics/handle/handle.go @@ -699,19 +699,24 @@ func (h *Handle) loadNeededColumnHistograms(reader *statsReader, col model.Table logutil.BgLogger().Error("fail to get stats version for this histogram", zap.Int64("table_id", col.TableID), zap.Int64("hist_id", col.ID)) return errors.Trace(fmt.Errorf("fail to get stats version for this histogram, table_id:%v, hist_id:%v", col.TableID, col.ID)) } + statsVer := rows[0].GetInt64(0) colHist := &statistics.Column{ - PhysicalID: col.TableID, - Histogram: *hg, - Info: c.Info, - CMSketch: cms, - TopN: topN, - FMSketch: fms, - IsHandle: c.IsHandle, - StatsVer: rows[0].GetInt64(0), - StatsLoadedStatus: statistics.NewStatsFullLoadStatus(), + PhysicalID: col.TableID, + Histogram: *hg, + Info: c.Info, + CMSketch: cms, + TopN: topN, + FMSketch: fms, + IsHandle: c.IsHandle, + StatsVer: statsVer, } // Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing colHist. colHist.Count = int64(colHist.TotalRowCount()) + // When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver. + // So we need add colHist.Count > 0 here. + if statsVer != statistics.Version0 || colHist.Count > 0 { + colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() + } // Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions // like `GetPartitionStats` called in `fmSketchFromStorage` would have modified the stats cache already. 
oldCache = h.statsCache.Load().(statsCache) @@ -835,6 +840,7 @@ func (h *Handle) indexStatsFromStorage(reader *statsReader, row chunk.Row, table distinct := row.GetInt64(3) histVer := row.GetUint64(4) nullCount := row.GetInt64(5) + statsVer := row.GetInt64(7) idx := table.Indices[histID] errorRate := statistics.ErrorRate{} flag := row.GetInt64(8) @@ -861,10 +867,20 @@ func (h *Handle) indexStatsFromStorage(reader *statsReader, row chunk.Row, table if err != nil { return errors.Trace(err) } - idx = &statistics.Index{Histogram: *hg, CMSketch: cms, TopN: topN, FMSketch: fmSketch, - Info: idxInfo, ErrorRate: errorRate, StatsVer: row.GetInt64(7), Flag: flag, - PhysicalID: table.PhysicalID, - StatsLoadedStatus: statistics.NewStatsFullLoadStatus()} + idx = &statistics.Index{ + Histogram: *hg, + CMSketch: cms, + TopN: topN, + FMSketch: fmSketch, + Info: idxInfo, + ErrorRate: errorRate, + StatsVer: statsVer, + Flag: flag, + PhysicalID: table.PhysicalID, + } + if statsVer != statistics.Version0 { + idx.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() + } lastAnalyzePos.Copy(&idx.LastAnalyzePos) } break @@ -923,6 +939,11 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl Flag: flag, StatsVer: statsVer, } + // When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver. + // So we need add col.Count > 0 here. 
+ if statsVer != statistics.Version0 || col.Count > 0 { + col.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus() + } lastAnalyzePos.Copy(&col.LastAnalyzePos) col.Histogram.Correlation = correlation break @@ -946,20 +967,24 @@ func (h *Handle) columnStatsFromStorage(reader *statsReader, row chunk.Row, tabl } } col = &statistics.Column{ - PhysicalID: table.PhysicalID, - Histogram: *hg, - Info: colInfo, - CMSketch: cms, - TopN: topN, - FMSketch: fmSketch, - ErrorRate: errorRate, - IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()), - Flag: flag, - StatsVer: statsVer, - StatsLoadedStatus: statistics.NewStatsFullLoadStatus(), + PhysicalID: table.PhysicalID, + Histogram: *hg, + Info: colInfo, + CMSketch: cms, + TopN: topN, + FMSketch: fmSketch, + ErrorRate: errorRate, + IsHandle: tableInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()), + Flag: flag, + StatsVer: statsVer, } // Column.Count is calculated by Column.TotalRowCount(). Hence we don't set Column.Count when initializing col. col.Count = int64(col.TotalRowCount()) + // When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver. + // So we need to add col.Count > 0 here. 
+ if statsVer != statistics.Version0 || col.Count > 0 { + col.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() + } lastAnalyzePos.Copy(&col.LastAnalyzePos) break } diff --git a/statistics/handle/handle_hist.go b/statistics/handle/handle_hist.go index 8e87308e22..1392590130 100644 --- a/statistics/handle/handle_hist.go +++ b/statistics/handle/handle_hist.go @@ -345,34 +345,42 @@ func (h *Handle) readStatsForOneItem(item model.TableItemID, w *statsWrapper, re zap.Int64("hist_id", item.ID), zap.Bool("is_index", item.IsIndex)) return nil, errors.Trace(fmt.Errorf("fail to get stats version for this histogram, table_id:%v, hist_id:%v, is_index:%v", item.TableID, item.ID, item.IsIndex)) } + statsVer := rows[0].GetInt64(0) if item.IsIndex { idxHist := &statistics.Index{ - Histogram: *hg, - CMSketch: cms, - TopN: topN, - FMSketch: fms, - Info: index.Info, - ErrorRate: index.ErrorRate, - StatsVer: rows[0].GetInt64(0), Flag: index.Flag, - PhysicalID: index.PhysicalID, - StatsLoadedStatus: statistics.NewStatsFullLoadStatus(), + Histogram: *hg, + CMSketch: cms, + TopN: topN, + FMSketch: fms, + Info: index.Info, + ErrorRate: index.ErrorRate, + StatsVer: statsVer, + Flag: index.Flag, + PhysicalID: index.PhysicalID, + } + if statsVer != statistics.Version0 { + idxHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() } index.LastAnalyzePos.Copy(&idxHist.LastAnalyzePos) w.idx = idxHist } else { colHist := &statistics.Column{ - PhysicalID: item.TableID, - Histogram: *hg, - Info: c.Info, - CMSketch: cms, - TopN: topN, - FMSketch: fms, - IsHandle: c.IsHandle, - StatsVer: rows[0].GetInt64(0), - StatsLoadedStatus: statistics.NewStatsFullLoadStatus(), + PhysicalID: item.TableID, + Histogram: *hg, + Info: c.Info, + CMSketch: cms, + TopN: topN, + FMSketch: fms, + IsHandle: c.IsHandle, + StatsVer: statsVer, } // Column.Count is calculated by Column.TotalRowCount(). Hence, we don't set Column.Count when initializing colHist. 
colHist.Count = int64(colHist.TotalRowCount()) + // When adding/modifying a column, we create its stats(all values are default values) without setting stats_ver. + // So we need add colHist.Count > 0 here. + if statsVer != statistics.Version0 || colHist.Count > 0 { + colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus() + } w.col = colHist } return w, nil diff --git a/statistics/handle/handle_test.go b/statistics/handle/handle_test.go index c2137802d0..95a663949d 100644 --- a/statistics/handle/handle_test.go +++ b/statistics/handle/handle_test.go @@ -3357,3 +3357,38 @@ func TestAnalyzeTableLRUPut(t *testing.T) { tk.MustExec("analyze table test.t") require.Equal(t, tbl.Meta().ID, domain.GetDomain(tk.Session()).StatsHandle().GetStatsCacheFrontTable()) } + +func TestUninitializedStatsStatus(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + dom.StatsHandle().SetLease(0) + tk := testkit.NewTestKit(t, store) + tk.MustExec("use test") + tk.MustExec("drop table if exists t") + tk.MustExec("create table t(a int, b int, c int, index idx_a(a))") + h := dom.StatsHandle() + require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh())) + tk.MustExec("insert into t values (1,2,2), (3,4,4), (5,6,6), (7,8,8), (9,10,10)") + require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll)) + is := dom.InfoSchema() + require.NoError(t, h.Update(is)) + tbl, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + require.NoError(t, err) + tblInfo := tbl.Meta() + tblStats := h.GetTableStats(tblInfo) + for _, col := range tblStats.Columns { + require.False(t, col.IsStatsInitialized()) + } + for _, idx := range tblStats.Indices { + require.False(t, idx.IsStatsInitialized()) + } + tk.MustQuery("show stats_histograms where db_name = 'test' and table_name = 't'").Check(testkit.Rows()) + checkStatsPseudo := func() { + rows := tk.MustQuery("explain select * from t").Rows() + operatorInfo := rows[len(rows)-1][4].(string) + require.True(t, strings.Contains(operatorInfo, 
"stats:pseudo")) + } + tk.MustExec("set @@tidb_enable_pseudo_for_outdated_stats = true") + checkStatsPseudo() + tk.MustExec("set @@tidb_enable_pseudo_for_outdated_stats = false") + checkStatsPseudo() +} diff --git a/statistics/handle/update_test.go b/statistics/handle/update_test.go index 8335574ba4..cba45b44fe 100644 --- a/statistics/handle/update_test.go +++ b/statistics/handle/update_test.go @@ -1910,7 +1910,9 @@ func TestLoadHistCorrelation(t *testing.T) { h.Clear() require.NoError(t, h.Update(dom.InfoSchema())) result := testKit.MustQuery("show stats_histograms where Table_name = 't'") - require.Len(t, result.Rows(), 0) + // After https://github.com/pingcap/tidb/pull/37444, `show stats_histograms` displays the columns whose hist/topn/cmsketch + // are not loaded and their stats status is allEvicted. + require.Len(t, result.Rows(), 1) testKit.MustExec("explain select * from t where c = 1") require.NoError(t, h.LoadNeededHistograms()) result = testKit.MustQuery("show stats_histograms where Table_name = 't'") diff --git a/statistics/histogram.go b/statistics/histogram.go index 78db8f1b72..2133ccad3b 100644 --- a/statistics/histogram.go +++ b/statistics/histogram.go @@ -1570,6 +1570,16 @@ func NewStatsFullLoadStatus() StatsLoadedStatus { } } +// NewStatsAllEvictedStatus returns the status that only loads count/nullCount/NDV and doesn't load CMSketch/TopN/Histogram. +// When we load table stats, column stats is in allEvicted status by default. CMSketch/TopN/Histogram of column is only +// loaded when we really need column stats. +func NewStatsAllEvictedStatus() StatsLoadedStatus { + return StatsLoadedStatus{ + statsInitialized: true, + evictedStatus: allEvicted, + } +} + // IsStatsInitialized indicates whether the column/index's statistics was loaded from storage before. 
// Note that `IsStatsInitialized` only can be set in initializing func (s StatsLoadedStatus) IsStatsInitialized() bool { diff --git a/statistics/integration_test.go b/statistics/integration_test.go index 18bc2efefd..7c9c35023c 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -20,6 +20,7 @@ import ( "strconv" "strings" "testing" + "time" "github.com/pingcap/failpoint" "github.com/pingcap/tidb/parser/model" @@ -629,3 +630,27 @@ func TestCrossValidationSelectivity(t *testing.T) { "└─Selection 0.00 cop[tikv] gt(test.t.c, 1000)", " └─TableRangeScan 2.00 cop[tikv] table:t range:(1 0,1 1000), keep order:false")) } + +func TestShowHistogramsLoadStatus(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + tk := testkit.NewTestKit(t, store) + h := dom.StatsHandle() + origLease := h.Lease() + h.SetLease(time.Second) + defer func() { h.SetLease(origLease) }() + tk.MustExec("use test") + tk.MustExec("create table t(a int primary key, b int, c int, index idx(b, c))") + require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh())) + tk.MustExec("insert into t values (1,2,3), (4,5,6)") + require.NoError(t, h.DumpStatsDeltaToKV(handle.DumpAll)) + tk.MustExec("analyze table t") + require.NoError(t, h.Update(dom.InfoSchema())) + rows := tk.MustQuery("show stats_histograms where db_name = 'test' and table_name = 't'").Rows() + for _, row := range rows { + if row[3] == "a" || row[3] == "idx" { + require.Equal(t, "allLoaded", row[10].(string)) + } else { + require.Equal(t, "allEvicted", row[10].(string)) + } + } +} diff --git a/statistics/table.go b/statistics/table.go index 15745138b7..81cb4e9bf2 100644 --- a/statistics/table.go +++ b/statistics/table.go @@ -463,6 +463,21 @@ func (n *neededStatsMap) Length() int { // and use pseudo estimation. var RatioOfPseudoEstimate = atomic.NewFloat64(0.7) +// IsInitialized returns true if any column/index stats of the table is initialized. 
+func (t *Table) IsInitialized() bool { + for _, col := range t.Columns { + if col != nil && col.IsStatsInitialized() { + return true + } + } + for _, idx := range t.Indices { + if idx != nil && idx.IsStatsInitialized() { + return true + } + } + return false +} + // IsOutdated returns true if the table stats is outdated. func (t *Table) IsOutdated() bool { rowcount := t.GetColRowCount()