statistics: fix some problems related to stats async load (#57723)

close pingcap/tidb#57722, close pingcap/tidb#57735
This commit is contained in:
Yiding Cui
2024-11-27 23:25:23 +08:00
committed by GitHub
parent ca395fa4be
commit 2b03447f19
6 changed files with 69 additions and 32 deletions

View File

@ -82,7 +82,7 @@ go_test(
data = glob(["testdata/**"]),
embed = [":statistics"],
flaky = True,
shard_count = 37,
shard_count = 38,
deps = [
"//pkg/config",
"//pkg/meta/model",

View File

@ -260,3 +260,13 @@ func (c *Column) StatsAvailable() bool {
// StatsVer, so we check NDV > 0 || NullCount > 0 for the case.
return c.IsAnalyzed() || c.NDV > 0 || c.NullCount > 0
}
// EmptyColumn builds a Column for colInfo that carries no real statistics.
// Callers may use it for pseudo estimation, or install it so that stats which
// do not exist are not requested again.
//
// tid is stored as the PhysicalID. IsHandle is set only when pkIsHandle is true
// and colInfo carries the primary-key flag. colInfo must be non-nil because its
// ID, FieldType and flag are read here.
func EmptyColumn(tid int64, pkIsHandle bool, colInfo *model.ColumnInfo) *Column {
	// Every numeric argument other than the column ID is zero.
	emptyHist := NewHistogram(colInfo.ID, 0, 0, 0, &colInfo.FieldType, 0, 0)
	col := &Column{
		PhysicalID: tid,
		Info:       colInfo,
		Histogram:  *emptyHist,
	}
	col.IsHandle = pkIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag())
	return col
}

View File

@ -632,30 +632,38 @@ func CleanFakeItemsForShowHistInFlights(statsCache statstypes.StatsCache) int {
}
func loadNeededColumnHistograms(sctx sessionctx.Context, statsHandle statstypes.StatsHandle, col model.TableItemID, loadFMSketch bool, fullLoad bool) (err error) {
tbl, ok := statsHandle.Get(col.TableID)
statsTbl, ok := statsHandle.Get(col.TableID)
if !ok {
return nil
}
var colInfo *model.ColumnInfo
_, loadNeeded, analyzed := tbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded || !analyzed {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}
// Currently, the column info in the ColAndIdxExistenceMap cannot be initialized when lite-init-stats is disabled,
// so we have to get the column info from the domain.
is := sctx.GetDomainInfoSchema().(infoschema.InfoSchema)
tblInfo, ok := statsHandle.TableInfoByID(is, col.TableID)
tbl, ok := statsHandle.TableInfoByID(is, col.TableID)
if !ok {
return nil
}
colInfo = tblInfo.Meta().GetColumnByID(col.ID)
tblInfo := tbl.Meta()
colInfo := tblInfo.GetColumnByID(col.ID)
if colInfo == nil {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}
_, loadNeeded, analyzed := statsTbl.ColumnIsLoadNeeded(col.ID, true)
if !loadNeeded || !analyzed {
// If this column is not analyzed yet and we don't have it in memory,
// we create a fake one for the pseudo estimation.
// Otherwise, it will trigger the sync/async load again, even if the column has not been analyzed.
if loadNeeded && !analyzed {
fakeCol := statistics.EmptyColumn(tblInfo.ID, tblInfo.PKIsHandle, colInfo)
statsTbl.SetCol(col.ID, fakeCol)
statsHandle.UpdateStatsCache([]*statistics.Table{statsTbl}, nil)
}
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
return nil
}
hg, _, statsVer, _, err := HistMetaFromStorageWithHighPriority(sctx, &col, colInfo)
if hg == nil || err != nil {
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
@ -690,29 +698,29 @@ func loadNeededColumnHistograms(sctx sessionctx.Context, statsHandle statstypes.
CMSketch: cms,
TopN: topN,
FMSketch: fms,
IsHandle: tblInfo.Meta().PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
IsHandle: tblInfo.PKIsHandle && mysql.HasPriKeyFlag(colInfo.GetFlag()),
StatsVer: statsVer,
}
// Reload the latest stats cache, otherwise the `updateStatsCache` may fail with high probability, because functions
// like `GetPartitionStats` called in `fmSketchFromStorage` would have modified the stats cache already.
tbl, ok = statsHandle.Get(col.TableID)
statsTbl, ok = statsHandle.Get(col.TableID)
if !ok {
return nil
}
tbl = tbl.Copy()
statsTbl = statsTbl.Copy()
if colHist.StatsAvailable() {
if fullLoad {
colHist.StatsLoadedStatus = statistics.NewStatsFullLoadStatus()
} else {
colHist.StatsLoadedStatus = statistics.NewStatsAllEvictedStatus()
}
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, colHist.LastUpdateVersion)
if statsVer != statistics.Version0 {
tbl.StatsVer = int(statsVer)
statsTbl.LastAnalyzeVersion = max(statsTbl.LastAnalyzeVersion, colHist.LastUpdateVersion)
statsTbl.StatsVer = int(statsVer)
}
}
tbl.SetCol(col.ID, colHist)
statsHandle.UpdateStatsCache([]*statistics.Table{tbl}, nil)
statsTbl.SetCol(col.ID, colHist)
statsHandle.UpdateStatsCache([]*statistics.Table{statsTbl}, nil)
asyncload.AsyncLoadHistogramNeededItems.Delete(col)
if col.IsSyncLoadFailed {
logutil.BgLogger().Warn("Hist for column should already be loaded as sync but not found.",
@ -771,9 +779,9 @@ func loadNeededIndexHistograms(sctx sessionctx.Context, is infoschema.InfoSchema
tbl = tbl.Copy()
if idxHist.StatsVer != statistics.Version0 {
tbl.StatsVer = int(idxHist.StatsVer)
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, idxHist.LastUpdateVersion)
}
tbl.SetIdx(idx.ID, idxHist)
tbl.LastAnalyzeVersion = max(tbl.LastAnalyzeVersion, idxHist.LastUpdateVersion)
statsHandle.UpdateStatsCache([]*statistics.Table{tbl}, nil)
if idx.IsSyncLoadFailed {
logutil.BgLogger().Warn("Hist for index should already be loaded as sync but not found.",

View File

@ -357,13 +357,9 @@ func (s *statsSyncLoad) handleOneItemTask(task *statstypes.NeededItemTask) (err
// If this column is not analyzed yet and we don't have it in memory,
// we create a fake one for the pseudo estimation.
// Otherwise, it will trigger the sync/async load again, even if the column has not been analyzed.
if loadNeeded && !analyzed {
wrapper.col = &statistics.Column{
PhysicalID: item.TableID,
Info: wrapper.colInfo,
Histogram: *statistics.NewHistogram(item.ID, 0, 0, 0, &wrapper.colInfo.FieldType, 0, 0),
IsHandle: isPkIsHandle && mysql.HasPriKeyFlag(wrapper.colInfo.GetFlag()),
}
wrapper.col = statistics.EmptyColumn(item.TableID, isPkIsHandle, wrapper.colInfo)
s.updateCachedItem(tblInfo, item, wrapper.col, wrapper.idx, task.Item.FullLoad)
return nil
}

View File

@ -590,3 +590,25 @@ func TestGlobalIndexWithAnalyzeVersion1AndHistoricalStats(t *testing.T) {
// Each analyze will only generate one record
tk.MustQuery(fmt.Sprintf("select count(*) from mysql.stats_history where table_id=%d", tblID)).Equal(testkit.Rows("10"))
}
// TestLastAnalyzeVersionNotChangedWithAsyncStatsLoad checks that calling
// LoadNeededHistograms for a table that has never been analyzed does not make
// `show stats_meta` report a last analyze time.
//
// The scenario: create a table, insert a row and dump the stats delta, add a
// new column, run a query that references all three columns, then call
// LoadNeededHistograms and inspect the stats meta row.
func TestLastAnalyzeVersionNotChangedWithAsyncStatsLoad(t *testing.T) {
	store, dom := testkit.CreateMockStoreAndDomain(t)
	tk := testkit.NewTestKit(t, store)
	// NOTE(review): presumably a zero sync wait makes the query below leave its
	// needed columns to LoadNeededHistograms instead of loading them
	// synchronously — confirm.
	tk.MustExec("set @@tidb_stats_load_sync_wait = 0;")
	tk.MustExec("use test")
	tk.MustExec("create table t(a int, b int);")
	require.NoError(t, dom.StatsHandle().HandleDDLEvent(<-dom.StatsHandle().DDLEventCh()))
	require.NoError(t, dom.StatsHandle().Update(context.Background(), dom.InfoSchema()))
	tk.MustExec("insert into t values (1, 1);")
	err := dom.StatsHandle().DumpStatsDeltaToKV(true)
	require.NoError(t, err)
	tk.MustExec("alter table t add column c int default 1;")
	// Check the error here as well: a failure while handling the add-column
	// event must fail the test instead of being silently dropped.
	require.NoError(t, dom.StatsHandle().HandleDDLEvent(<-dom.StatsHandle().DDLEventCh()))
	tk.MustExec("select * from t where a = 1 or b = 1 or c = 1;")
	require.NoError(t, dom.StatsHandle().LoadNeededHistograms(dom.InfoSchema()))
	result := tk.MustQuery("show stats_meta where table_name = 't'")
	require.Len(t, result.Rows(), 1)
	// The last analyze time.
	require.Equal(t, "<nil>", result.Rows()[0][6])
}

View File

@ -811,7 +811,7 @@ func (t *Table) GetStatsHealthy() (int64, bool) {
}
// ColumnIsLoadNeeded checks whether the column needs trigger the async/sync load.
// The Column should be visible in the table and really has analyzed statistics in the stroage.
// The Column should be visible in the table and really has analyzed statistics in the storage.
// Also, if the stats has been loaded into the memory, we also don't need to load it.
// We return the Column together with the checking result, to avoid accessing the map multiple times.
// The first bool is whether we need to load it into memory. The second bool is whether this column has stats in the system table or not.
@ -820,7 +820,7 @@ func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool
return nil, false, false
}
// when we use non-lite init stats, it cannot init the stats for common columns.
// so we need to foce to load the stats.
// so we need to force to load the stats.
col, ok := t.columns[id]
if !ok {
return nil, true, true
@ -828,15 +828,16 @@ func (t *Table) ColumnIsLoadNeeded(id int64, fullLoad bool) (*Column, bool, bool
hasAnalyzed := t.ColAndIdxExistenceMap.HasAnalyzed(id, false)
// If it's not analyzed yet.
// The real check condition: !ok && !hasAnalyzed.
// After this check, we will always have ok && hasAnalyzed.
if !hasAnalyzed {
return nil, false, false
}
// Restore the condition from the simplified form:
// 1. !ok && hasAnalyzed => need load
// 2. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
// 3. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
if !ok || (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
// 1. ok && hasAnalyzed && fullLoad && !col.IsFullLoad => need load
// 2. ok && hasAnalyzed && !fullLoad && !col.statsInitialized => need load
if (fullLoad && !col.IsFullLoad()) || (!fullLoad && !col.statsInitialized) {
return col, true, true
}