executor: also use approximate count from pd to decide the sample rate (#29230)

This commit is contained in:
Yiding Cui
2021-11-01 11:08:51 +08:00
committed by GitHub
parent a18977a93b
commit afa7dfb10f
5 changed files with 111 additions and 11 deletions

View File

@ -1132,3 +1132,30 @@ func (s *testSuite10) TestSnapshotAnalyze(c *C) {
c.Assert(s3Str, Equals, s2Str)
c.Assert(failpoint.Disable("github.com/pingcap/tidb/executor/injectAnalyzeSnapshot"), IsNil)
}
// TestAdjustSampleRateNote checks the note that ANALYZE appends when the
// sample rate is chosen automatically. It first forces the stats_meta count to
// 220000 and expects a reported rate of 0.500000, then lets the real delta
// (3 rows) be dumped and expects a reported rate of 1.000000.
func (s *testSuite10) TestAdjustSampleRateNote(c *C) {
	tk := testkit.NewTestKit(c, s.store)
	tk.MustExec("use test")
	sctx := tk.Se.(sessionctx.Context)
	stats := domain.GetDomain(sctx).StatsHandle()

	// expectMetaCount asserts the row-count column shown by `show stats_meta` for table t.
	expectMetaCount := func(want string) {
		rows := tk.MustQuery("show stats_meta where table_name = 't'").Rows()
		c.Assert(rows[0][5], Equals, want)
	}
	// analyzeAndExpectNote runs ANALYZE and asserts that exactly this warning row is reported.
	analyzeAndExpectNote := func(note string) {
		tk.MustExec("analyze table t")
		tk.MustQuery("show warnings").Check(testkit.Rows(note))
	}

	tk.MustExec("drop table if exists t")
	tk.MustExec("create table t(a int, index index_a(a))")
	ddlEvent := <-stats.DDLEventCh()
	c.Assert(stats.HandleDDLEvent(ddlEvent), IsNil)

	schema := sctx.GetInfoSchema().(infoschema.InfoSchema)
	tbl, err := schema.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
	c.Assert(err, IsNil)
	tableID := tbl.Meta().ID

	// Pretend the table is large by overwriting the recorded count directly.
	overrideCountSQL := fmt.Sprintf("update mysql.stats_meta set count = 220000 where table_id=%d", tableID)
	tk.MustExec(overrideCountSQL)
	c.Assert(stats.Update(schema), IsNil)
	expectMetaCount("220000")
	analyzeAndExpectNote("Note 1105 Analyze use auto adjusted sample rate 0.500000 for table test.t.")

	// After real rows are inserted and the delta is dumped, the count reflects them.
	tk.MustExec("insert into t values(1),(1),(1)")
	c.Assert(stats.DumpStatsDeltaToKV(handle.DumpAll), IsNil)
	c.Assert(stats.Update(schema), IsNil)
	expectMetaCount("3")
	analyzeAndExpectNote("Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.")
}

View File

@ -45,6 +45,7 @@ import (
"github.com/pingcap/tidb/sessionctx"
"github.com/pingcap/tidb/sessionctx/stmtctx"
"github.com/pingcap/tidb/statistics"
"github.com/pingcap/tidb/store/helper"
"github.com/pingcap/tidb/table"
"github.com/pingcap/tidb/table/tables"
"github.com/pingcap/tidb/table/temptable"
@ -2225,6 +2226,22 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(task plannercore.AnalyzeC
*sampleRate = math.Float64frombits(opts[ast.AnalyzeOptSampleRate])
if *sampleRate < 0 {
*sampleRate = b.getAdjustedSampleRate(b.ctx, task.TableID.GetStatisticsID(), task.TblInfo)
if task.PartitionName != "" {
sc.AppendNote(errors.Errorf(
"Analyze use auto adjusted sample rate %f for table %s.%s's partition %s.",
*sampleRate,
task.DBName,
task.TableName,
task.PartitionName,
))
} else {
sc.AppendNote(errors.Errorf(
"Analyze use auto adjusted sample rate %f for table %s.%s.",
*sampleRate,
task.DBName,
task.TableName,
))
}
}
}
e.analyzePB.ColReq = &tipb.AnalyzeColumnsReq{
@ -2245,6 +2262,7 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(task plannercore.AnalyzeC
return &analyzeTask{taskType: colTask, colExec: e, job: job}
}
// getAdjustedSampleRate calculates the sample rate from the table size. If the table size cannot be obtained, 0.001 is used as the default sample rate.
func (b *executorBuilder) getAdjustedSampleRate(sctx sessionctx.Context, tid int64, tblInfo *model.TableInfo) float64 {
statsHandle := domain.GetDomain(sctx).StatsHandle()
defaultRate := 0.001
@ -2257,17 +2275,47 @@ func (b *executorBuilder) getAdjustedSampleRate(sctx sessionctx.Context, tid int
} else {
statsTbl = statsHandle.GetPartitionStats(tblInfo, tid)
}
if statsTbl == nil {
approxiCount, hasPD := b.getApproximateTableCountFromPD(sctx, tid)
// If there's no stats meta and no pd, return the default rate.
if statsTbl == nil && !hasPD {
return defaultRate
}
// If the count in stats_meta is still 0, the table is not large, we scan all rows.
// If the count in stats_meta is still 0 and there's no information from pd side, we scan all rows.
if statsTbl.Count == 0 && !hasPD {
return 1
}
// We have issue https://github.com/pingcap/tidb/issues/29216.
// As a workaround for this issue, we compare against the approxiCount from the pd side.
// If the count from the stats_meta is far smaller than the approximate count from the pd,
// we assume that we have hit this issue and use the approximate count to calculate the sample rate.
if float64(statsTbl.Count*100) < approxiCount {
// Confirmed by TiKV side, the empirical error rate of the approximate count is about 20%.
// So we increase the number to 150000 to compensate for this error rate.
return math.Min(1, 150000/approxiCount)
}
// If we did not take the branch above and the count is still zero, return 1 to prevent division by zero.
if statsTbl.Count == 0 {
return 1
}
// We are expected to scan about 100000 rows or so.
// Since there's a tiny error rate in the count from the stats meta, we use 110000 to get a slightly larger result.
return math.Min(1, 110000/float64(statsTbl.Count))
}
// getApproximateTableCountFromPD fetches the region statistics for tid through
// helper.GetPDRegionStats and returns their StorageKeys value as a float64.
// The boolean result is false, with a count of 0, when the session's store
// does not implement helper.Storage or when the region-stats request fails.
func (b *executorBuilder) getApproximateTableCountFromPD(sctx sessionctx.Context, tid int64) (float64, bool) {
	storage, isHelperStorage := sctx.GetStore().(helper.Storage)
	if !isHelperStorage {
		return 0, false
	}
	var stats helper.PDRegionStats
	pdClient := helper.NewHelper(storage)
	// The error is intentionally not propagated: the caller only needs to know
	// whether an approximate count is available.
	if err := pdClient.GetPDRegionStats(tid, &stats); err != nil {
		return 0, false
	}
	return float64(stats.StorageKeys), true
}
func (b *executorBuilder) buildAnalyzeColumnsPushdown(task plannercore.AnalyzeColumnsTask, opts map[ast.AnalyzeOptionType]uint64, autoAnalyze string, schemaForVirtualColEval *expression.Schema) *analyzeTask {
if task.StatsVersion == statistics.Version2 {
return b.buildAnalyzeSamplingPushdown(task, opts, autoAnalyze, schemaForVirtualColEval)

View File

@ -587,7 +587,7 @@ func (s *testInfoschemaTableSuite) TestForAnalyzeStatus(c *C) {
tk.MustExec("create table t1 (a int, b int, index idx(a))")
tk.MustExec("insert into t1 values (1,2),(3,4)")
tk.MustExec("analyze table t1")
tk.MustQuery("show warnings").Check(testkit.Rows()) // no warning
tk.MustQuery("show warnings").Check(testkit.Rows("Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t1.")) // 1 note.
c.Assert(s.dom.StatsHandle().LoadNeededHistograms(), IsNil)
tk.MustExec("CREATE ROLE r_t1 ;")
tk.MustExec("GRANT ALL PRIVILEGES ON test.t1 TO r_t1;")

View File

@ -3997,9 +3997,10 @@ func (s *testIntegrationSuite) TestIncrementalAnalyzeStatsVer2(c *C) {
c.Assert(rows[0][0], Equals, "3")
tk.MustExec("insert into t values(4,4),(5,5),(6,6)")
tk.MustExec("analyze incremental table t index idx_b")
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings(), HasLen, 2)
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings(), HasLen, 3)
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[0].Err.Error(), Equals, "The version 2 would collect all statistics not only the selected indexes")
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[1].Err.Error(), Equals, "The version 2 stats would ignore the INCREMENTAL keyword and do full sampling")
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[2].Err.Error(), Equals, "Analyze use auto adjusted sample rate 1.000000 for table test.t.")
rows = tk.MustQuery(fmt.Sprintf("select distinct_count from mysql.stats_histograms where table_id = %d and is_index = 1", tblID)).Rows()
c.Assert(len(rows), Equals, 1)
c.Assert(rows[0][0], Equals, "6")

View File

@ -2177,6 +2177,7 @@ func (s *testStatsSuite) TestFMSWithAnalyzePartition(c *C) {
tk.MustQuery("select count(*) from mysql.stats_fm_sketch").Check(testkit.Rows("0"))
tk.MustExec("analyze table t partition p0 with 1 topn, 2 buckets")
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
"Warning 8131 Build table: `t` global-level stats failed due to missing partition-level stats",
"Warning 8131 Build table: `t` index: `a` global-level stats failed due to missing partition-level stats",
))
@ -3093,7 +3094,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithPrimaryKey(c *C) {
tblID := tbl.Meta().ID
tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
))
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
c.Assert(len(rows), Equals, 2)
c.Assert(rows[0][3], Equals, "a")
@ -3136,7 +3140,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithIndex(c *C) {
tblID := tbl.Meta().ID
tk.MustExec("analyze table t columns c with 2 topn, 2 buckets")
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
"Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
))
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
c.Assert(len(rows), Equals, 3)
c.Assert(rows[0][3], Equals, "b")
@ -3188,7 +3195,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithClusteredIndex(c *C) {
tblID := tbl.Meta().ID
tk.MustExec("analyze table t columns c with 2 topn, 2 buckets")
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
"Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
))
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
c.Assert(len(rows), Equals, 3)
c.Assert(rows[0][3], Equals, "b")
@ -3263,7 +3273,11 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithDynamicPartitionTable(c *C) {
p1ID := defs[1].ID
tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p1.",
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
))
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
c.Assert(len(rows), Equals, 6)
c.Assert(rows[0][:4], DeepEquals, []interface{}{"test", "t", "global", "a"})
@ -3361,7 +3375,11 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithStaticPartitionTable(c *C) {
p1ID := defs[1].ID
tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p1.",
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
))
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
c.Assert(len(rows), Equals, 4)
c.Assert(rows[0][:4], DeepEquals, []interface{}{"test", "t", "p0", "a"})
@ -3440,7 +3458,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithExtendedStats(c *C) {
tblID := tbl.Meta().ID
tk.MustExec("analyze table t columns b with 2 topn, 2 buckets")
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
))
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
c.Assert(len(rows), Equals, 2)
c.Assert(rows[0][3], Equals, "b")
@ -3486,7 +3507,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithVirtualColumnIndex(c *C) {
tblID := tbl.Meta().ID
tk.MustExec("analyze table t columns b with 2 topn, 2 buckets")
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
))
// virtual column c is skipped when dumping stats into disk, so only the stats of column b are updated
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Rows()
c.Assert(len(rows), Equals, 1)