From fd45f737ccbdde1c69b4644e0d3da16f181952cc Mon Sep 17 00:00:00 2001 From: Yifan Xu <30385241+xuyifangreeneyes@users.noreply.github.com> Date: Fri, 10 Mar 2023 14:15:13 +0800 Subject: [PATCH] statistics: fix wrong column stats loading after analyze twice (#42076) close pingcap/tidb#42073 --- statistics/integration_test.go | 26 ++++++++++++++++++++++++++ statistics/interact_with_storage.go | 15 ++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/statistics/integration_test.go b/statistics/integration_test.go index 1c8672b790..569a23dbbb 100644 --- a/statistics/integration_test.go +++ b/statistics/integration_test.go @@ -693,3 +693,29 @@ func TestSingleColumnIndexNDV(t *testing.T) { require.Equal(t, expectedResults[i][2], row[7]) // null_count } } + +func TestColumnStatsLazyLoad(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + tk := testkit.NewTestKit(t, store) + h := dom.StatsHandle() + originLease := h.Lease() + defer h.SetLease(originLease) + // Set `Lease` to `Millisecond` to enable column stats lazy load. 
+ h.SetLease(time.Millisecond) + tk.MustExec("use test") + tk.MustExec("create table t(a int, b int)") + tk.MustExec("insert into t values (1,2), (3,4), (5,6), (7,8)") + require.NoError(t, h.HandleDDLEvent(<-h.DDLEventCh())) + tk.MustExec("analyze table t") + is := dom.InfoSchema() + tbl, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t")) + require.NoError(t, err) + tblInfo := tbl.Meta() + c1 := tblInfo.Columns[0] + c2 := tblInfo.Columns[1] + require.True(t, h.GetTableStats(tblInfo).Columns[c1.ID].IsAllEvicted()) + require.True(t, h.GetTableStats(tblInfo).Columns[c2.ID].IsAllEvicted()) + tk.MustExec("analyze table t") + require.True(t, h.GetTableStats(tblInfo).Columns[c1.ID].IsAllEvicted()) + require.True(t, h.GetTableStats(tblInfo).Columns[c2.ID].IsAllEvicted()) +} diff --git a/statistics/interact_with_storage.go b/statistics/interact_with_storage.go index c0acce31b7..1ad1b82080 100644 --- a/statistics/interact_with_storage.go +++ b/statistics/interact_with_storage.go @@ -334,10 +334,23 @@ func columnStatsFromStorage(reader *StatsReader, row chunk.Row, table *Table, ta // 2. this column is not handle, and: // 3. the column doesn't has any statistics before, and: // 4. loadAll is false. + // + // Here is the explanation of the condition `!col.IsStatsInitialized() || col.IsAllEvicted()`. + // For one column: + // 1. If there are no stats for it in the storage (i.e., analyze has never been executed before), then its stats status + // would be `!col.IsStatsInitialized()`. In this case we should take the `notNeedLoad` path. + // 2. If there exist stats for it in the storage but its stats status is `col.IsAllEvicted()`, there are two + // sub cases for this case. One is that the column stats have never been used/needed by the optimizer so they have + // never been loaded. The other is that the column stats were loaded and then evicted. For both sub cases, + // we should take the `notNeedLoad` path. + // 3. 
If some parts (Histogram/TopN/CMSketch) of stats for it currently exist in TiDB memory, we choose to load all of + // its new stats once we find that the stats version is updated. notNeedLoad := lease > 0 && !isHandle && - (col == nil || !col.IsStatsInitialized() && col.LastUpdateVersion < histVer) && + (col == nil || ((!col.IsStatsInitialized() || col.IsAllEvicted()) && col.LastUpdateVersion < histVer)) && !loadAll + // The `notNeedLoad` path: the cases in which it is taken are explained above the condition. + // It starts by fetching the column count from storage. if notNeedLoad { count, err := ColumnCountFromStorage(reader, table.PhysicalID, histID, statsVer) if err != nil {