executor: also use approximate count from pd to decide the sample rate (#29230)
This commit is contained in:
@ -1132,3 +1132,30 @@ func (s *testSuite10) TestSnapshotAnalyze(c *C) {
|
||||
c.Assert(s3Str, Equals, s2Str)
|
||||
c.Assert(failpoint.Disable("github.com/pingcap/tidb/executor/injectAnalyzeSnapshot"), IsNil)
|
||||
}
|
||||
|
||||
func (s *testSuite10) TestAdjustSampleRateNote(c *C) {
|
||||
tk := testkit.NewTestKit(c, s.store)
|
||||
tk.MustExec("use test")
|
||||
statsHandle := domain.GetDomain(tk.Se.(sessionctx.Context)).StatsHandle()
|
||||
tk.MustExec("drop table if exists t")
|
||||
tk.MustExec("create table t(a int, index index_a(a))")
|
||||
c.Assert(statsHandle.HandleDDLEvent(<-statsHandle.DDLEventCh()), IsNil)
|
||||
is := tk.Se.(sessionctx.Context).GetInfoSchema().(infoschema.InfoSchema)
|
||||
tbl, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
|
||||
c.Assert(err, IsNil)
|
||||
tblInfo := tbl.Meta()
|
||||
tid := tblInfo.ID
|
||||
tk.MustExec(fmt.Sprintf("update mysql.stats_meta set count = 220000 where table_id=%d", tid))
|
||||
c.Assert(statsHandle.Update(is), IsNil)
|
||||
result := tk.MustQuery("show stats_meta where table_name = 't'")
|
||||
c.Assert(result.Rows()[0][5], Equals, "220000")
|
||||
tk.MustExec("analyze table t")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Note 1105 Analyze use auto adjusted sample rate 0.500000 for table test.t."))
|
||||
tk.MustExec("insert into t values(1),(1),(1)")
|
||||
c.Assert(statsHandle.DumpStatsDeltaToKV(handle.DumpAll), IsNil)
|
||||
c.Assert(statsHandle.Update(is), IsNil)
|
||||
result = tk.MustQuery("show stats_meta where table_name = 't'")
|
||||
c.Assert(result.Rows()[0][5], Equals, "3")
|
||||
tk.MustExec("analyze table t")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t."))
|
||||
}
|
||||
|
||||
@ -45,6 +45,7 @@ import (
|
||||
"github.com/pingcap/tidb/sessionctx"
|
||||
"github.com/pingcap/tidb/sessionctx/stmtctx"
|
||||
"github.com/pingcap/tidb/statistics"
|
||||
"github.com/pingcap/tidb/store/helper"
|
||||
"github.com/pingcap/tidb/table"
|
||||
"github.com/pingcap/tidb/table/tables"
|
||||
"github.com/pingcap/tidb/table/temptable"
|
||||
@ -2225,6 +2226,22 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(task plannercore.AnalyzeC
|
||||
*sampleRate = math.Float64frombits(opts[ast.AnalyzeOptSampleRate])
|
||||
if *sampleRate < 0 {
|
||||
*sampleRate = b.getAdjustedSampleRate(b.ctx, task.TableID.GetStatisticsID(), task.TblInfo)
|
||||
if task.PartitionName != "" {
|
||||
sc.AppendNote(errors.Errorf(
|
||||
"Analyze use auto adjusted sample rate %f for table %s.%s's partition %s.",
|
||||
*sampleRate,
|
||||
task.DBName,
|
||||
task.TableName,
|
||||
task.PartitionName,
|
||||
))
|
||||
} else {
|
||||
sc.AppendNote(errors.Errorf(
|
||||
"Analyze use auto adjusted sample rate %f for table %s.%s.",
|
||||
*sampleRate,
|
||||
task.DBName,
|
||||
task.TableName,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
e.analyzePB.ColReq = &tipb.AnalyzeColumnsReq{
|
||||
@ -2245,6 +2262,7 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(task plannercore.AnalyzeC
|
||||
return &analyzeTask{taskType: colTask, colExec: e, job: job}
|
||||
}
|
||||
|
||||
// getAdjustedSampleRate calculates the sample rate from the table size. If the table size cannot be obtained, 0.001 is used as the default sample rate.
|
||||
func (b *executorBuilder) getAdjustedSampleRate(sctx sessionctx.Context, tid int64, tblInfo *model.TableInfo) float64 {
|
||||
statsHandle := domain.GetDomain(sctx).StatsHandle()
|
||||
defaultRate := 0.001
|
||||
@ -2257,17 +2275,47 @@ func (b *executorBuilder) getAdjustedSampleRate(sctx sessionctx.Context, tid int
|
||||
} else {
|
||||
statsTbl = statsHandle.GetPartitionStats(tblInfo, tid)
|
||||
}
|
||||
if statsTbl == nil {
|
||||
approxiCount, hasPD := b.getApproximateTableCountFromPD(sctx, tid)
|
||||
// If there's no stats meta and no pd, return the default rate.
|
||||
if statsTbl == nil && !hasPD {
|
||||
return defaultRate
|
||||
}
|
||||
// If the count in stats_meta is still 0, the table is not large, we scan all rows.
|
||||
// If the count in stats_meta is still 0 and there's no information from pd side, we scan all rows.
|
||||
if statsTbl.Count == 0 && !hasPD {
|
||||
return 1
|
||||
}
|
||||
// There is a known issue: https://github.com/pingcap/tidb/issues/29216.
|
||||
// As a workaround for this issue, we compare the count against the approxiCount from the pd side.
|
||||
// If the count from the stats_meta is far smaller than the approximate count from the pd,
|
||||
// we think that we meet this issue and use the approximate count to calculate the sample rate.
|
||||
if float64(statsTbl.Count*100) < approxiCount {
|
||||
// As confirmed by the TiKV side, the empirical error rate of the approximate count is about 20%.
|
||||
// So we increase the number to 150000 to reduce this error rate.
|
||||
return math.Min(1, 150000/approxiCount)
|
||||
}
|
||||
// If we did not take the branch above and the count is still zero, return 1 to avoid dividing by zero.
|
||||
if statsTbl.Count == 0 {
|
||||
return 1
|
||||
}
|
||||
// We expect to scan about 100000 rows or so.
|
||||
// Since there is a small error in the count from the stats meta, we use 110000 to get a slightly larger result.
|
||||
return math.Min(1, 110000/float64(statsTbl.Count))
|
||||
}
|
||||
|
||||
func (b *executorBuilder) getApproximateTableCountFromPD(sctx sessionctx.Context, tid int64) (float64, bool) {
|
||||
tikvStore, ok := sctx.GetStore().(helper.Storage)
|
||||
if !ok {
|
||||
return 0, false
|
||||
}
|
||||
regionStats := &helper.PDRegionStats{}
|
||||
pdHelper := helper.NewHelper(tikvStore)
|
||||
err := pdHelper.GetPDRegionStats(tid, regionStats)
|
||||
if err != nil {
|
||||
return 0, false
|
||||
}
|
||||
return float64(regionStats.StorageKeys), true
|
||||
}
|
||||
|
||||
func (b *executorBuilder) buildAnalyzeColumnsPushdown(task plannercore.AnalyzeColumnsTask, opts map[ast.AnalyzeOptionType]uint64, autoAnalyze string, schemaForVirtualColEval *expression.Schema) *analyzeTask {
|
||||
if task.StatsVersion == statistics.Version2 {
|
||||
return b.buildAnalyzeSamplingPushdown(task, opts, autoAnalyze, schemaForVirtualColEval)
|
||||
|
||||
@ -587,7 +587,7 @@ func (s *testInfoschemaTableSuite) TestForAnalyzeStatus(c *C) {
|
||||
tk.MustExec("create table t1 (a int, b int, index idx(a))")
|
||||
tk.MustExec("insert into t1 values (1,2),(3,4)")
|
||||
tk.MustExec("analyze table t1")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows()) // no warning
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t1.")) // 1 note.
|
||||
c.Assert(s.dom.StatsHandle().LoadNeededHistograms(), IsNil)
|
||||
tk.MustExec("CREATE ROLE r_t1 ;")
|
||||
tk.MustExec("GRANT ALL PRIVILEGES ON test.t1 TO r_t1;")
|
||||
|
||||
@ -3997,9 +3997,10 @@ func (s *testIntegrationSuite) TestIncrementalAnalyzeStatsVer2(c *C) {
|
||||
c.Assert(rows[0][0], Equals, "3")
|
||||
tk.MustExec("insert into t values(4,4),(5,5),(6,6)")
|
||||
tk.MustExec("analyze incremental table t index idx_b")
|
||||
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings(), HasLen, 2)
|
||||
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings(), HasLen, 3)
|
||||
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[0].Err.Error(), Equals, "The version 2 would collect all statistics not only the selected indexes")
|
||||
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[1].Err.Error(), Equals, "The version 2 stats would ignore the INCREMENTAL keyword and do full sampling")
|
||||
c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[2].Err.Error(), Equals, "Analyze use auto adjusted sample rate 1.000000 for table test.t.")
|
||||
rows = tk.MustQuery(fmt.Sprintf("select distinct_count from mysql.stats_histograms where table_id = %d and is_index = 1", tblID)).Rows()
|
||||
c.Assert(len(rows), Equals, 1)
|
||||
c.Assert(rows[0][0], Equals, "6")
|
||||
|
||||
@ -2177,6 +2177,7 @@ func (s *testStatsSuite) TestFMSWithAnalyzePartition(c *C) {
|
||||
tk.MustQuery("select count(*) from mysql.stats_fm_sketch").Check(testkit.Rows("0"))
|
||||
tk.MustExec("analyze table t partition p0 with 1 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
|
||||
"Warning 8131 Build table: `t` global-level stats failed due to missing partition-level stats",
|
||||
"Warning 8131 Build table: `t` index: `a` global-level stats failed due to missing partition-level stats",
|
||||
))
|
||||
@ -3093,7 +3094,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithPrimaryKey(c *C) {
|
||||
tblID := tbl.Meta().ID
|
||||
|
||||
tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
|
||||
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
|
||||
))
|
||||
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
|
||||
c.Assert(len(rows), Equals, 2)
|
||||
c.Assert(rows[0][3], Equals, "a")
|
||||
@ -3136,7 +3140,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithIndex(c *C) {
|
||||
tblID := tbl.Meta().ID
|
||||
|
||||
tk.MustExec("analyze table t columns c with 2 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
|
||||
"Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
|
||||
))
|
||||
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
|
||||
c.Assert(len(rows), Equals, 3)
|
||||
c.Assert(rows[0][3], Equals, "b")
|
||||
@ -3188,7 +3195,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithClusteredIndex(c *C) {
|
||||
tblID := tbl.Meta().ID
|
||||
|
||||
tk.MustExec("analyze table t columns c with 2 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
|
||||
"Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
|
||||
))
|
||||
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
|
||||
c.Assert(len(rows), Equals, 3)
|
||||
c.Assert(rows[0][3], Equals, "b")
|
||||
@ -3263,7 +3273,11 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithDynamicPartitionTable(c *C) {
|
||||
p1ID := defs[1].ID
|
||||
|
||||
tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p1.",
|
||||
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
|
||||
))
|
||||
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
|
||||
c.Assert(len(rows), Equals, 6)
|
||||
c.Assert(rows[0][:4], DeepEquals, []interface{}{"test", "t", "global", "a"})
|
||||
@ -3361,7 +3375,11 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithStaticPartitionTable(c *C) {
|
||||
p1ID := defs[1].ID
|
||||
|
||||
tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p1.",
|
||||
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
|
||||
))
|
||||
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
|
||||
c.Assert(len(rows), Equals, 4)
|
||||
c.Assert(rows[0][:4], DeepEquals, []interface{}{"test", "t", "p0", "a"})
|
||||
@ -3440,7 +3458,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithExtendedStats(c *C) {
|
||||
tblID := tbl.Meta().ID
|
||||
|
||||
tk.MustExec("analyze table t columns b with 2 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
|
||||
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
|
||||
))
|
||||
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
|
||||
c.Assert(len(rows), Equals, 2)
|
||||
c.Assert(rows[0][3], Equals, "b")
|
||||
@ -3486,7 +3507,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithVirtualColumnIndex(c *C) {
|
||||
tblID := tbl.Meta().ID
|
||||
|
||||
tk.MustExec("analyze table t columns b with 2 topn, 2 buckets")
|
||||
tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
|
||||
tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
|
||||
"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
|
||||
"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
|
||||
))
|
||||
// virtual column c is skipped when dumping stats into disk, so only the stats of column b are updated
|
||||
rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Rows()
|
||||
c.Assert(len(rows), Equals, 1)
|
||||
|
||||
Reference in New Issue
Block a user