From afa7dfb10ff8a3993aa22bddfbf32632d171f8db Mon Sep 17 00:00:00 2001
From: Yiding Cui <winoros@gmail.com>
Date: Mon, 1 Nov 2021 11:08:51 +0800
Subject: [PATCH] executor: also use approximate count from pd to decide the
 sample rate (#29230)

---
 executor/analyze_test.go           | 27 ++++++++++++++++
 executor/builder.go                | 52 ++++++++++++++++++++++++++++--
 executor/infoschema_reader_test.go |  2 +-
 planner/core/integration_test.go   |  3 +-
 statistics/handle/handle_test.go   | 38 ++++++++++++++++++----
 5 files changed, 111 insertions(+), 11 deletions(-)

diff --git a/executor/analyze_test.go b/executor/analyze_test.go
index a6b67e23ed..32a9c00edd 100644
--- a/executor/analyze_test.go
+++ b/executor/analyze_test.go
@@ -1132,3 +1132,30 @@ func (s *testSuite10) TestSnapshotAnalyze(c *C) {
 	c.Assert(s3Str, Equals, s2Str)
 	c.Assert(failpoint.Disable("github.com/pingcap/tidb/executor/injectAnalyzeSnapshot"), IsNil)
 }
+
+func (s *testSuite10) TestAdjustSampleRateNote(c *C) { // checks the sample-rate note that "analyze table" leaves in "show warnings"
+	tk := testkit.NewTestKit(c, s.store)
+	tk.MustExec("use test")
+	statsHandle := domain.GetDomain(tk.Se.(sessionctx.Context)).StatsHandle()
+	tk.MustExec("drop table if exists t")
+	tk.MustExec("create table t(a int, index index_a(a))")
+	c.Assert(statsHandle.HandleDDLEvent(<-statsHandle.DDLEventCh()), IsNil) // presumably the event of the create table above
+	is := tk.Se.(sessionctx.Context).GetInfoSchema().(infoschema.InfoSchema)
+	tbl, err := is.TableByName(model.NewCIStr("test"), model.NewCIStr("t"))
+	c.Assert(err, IsNil)
+	tblInfo := tbl.Meta()
+	tid := tblInfo.ID
+	tk.MustExec(fmt.Sprintf("update mysql.stats_meta set count = 220000 where table_id=%d", tid)) // fake a row count of 220000 without inserting any row
+	c.Assert(statsHandle.Update(is), IsNil) // presumably reloads the edited stats_meta row; the next two lines confirm the count is visible
+	result := tk.MustQuery("show stats_meta where table_name = 't'")
+	c.Assert(result.Rows()[0][5], Equals, "220000")
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show warnings").Check(testkit.Rows("Note 1105 Analyze use auto adjusted sample rate 0.500000 for table test.t.")) // 0.5 == 110000/220000
+	tk.MustExec("insert into t values(1),(1),(1)")
+	c.Assert(statsHandle.DumpStatsDeltaToKV(handle.DumpAll), IsNil) // presumably persists the row-count delta of the insert above
+	c.Assert(statsHandle.Update(is), IsNil)
+	result = tk.MustQuery("show stats_meta where table_name = 't'")
+	c.Assert(result.Rows()[0][5], Equals, "3") // the faked 220000 is gone: only the 3 inserted rows are counted now
+	tk.MustExec("analyze table t")
+	tk.MustQuery("show warnings").Check(testkit.Rows("Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.")) // 110000/3 exceeds 1, so the rate is capped at 1
+}
diff --git a/executor/builder.go b/executor/builder.go
index 09e288ea52..d064618fce 100644
--- a/executor/builder.go
+++ b/executor/builder.go
@@ -45,6 +45,7 @@ import (
 	"github.com/pingcap/tidb/sessionctx"
 	"github.com/pingcap/tidb/sessionctx/stmtctx"
 	"github.com/pingcap/tidb/statistics"
+	"github.com/pingcap/tidb/store/helper"
 	"github.com/pingcap/tidb/table"
 	"github.com/pingcap/tidb/table/tables"
 	"github.com/pingcap/tidb/table/temptable"
@@ -2225,6 +2226,22 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(task plannercore.AnalyzeC
 		*sampleRate = math.Float64frombits(opts[ast.AnalyzeOptSampleRate])
 		if *sampleRate < 0 {
 			*sampleRate = b.getAdjustedSampleRate(b.ctx, task.TableID.GetStatisticsID(), task.TblInfo)
+			if task.PartitionName != "" {
+				sc.AppendNote(errors.Errorf(
+					"Analyze use auto adjusted sample rate %f for table %s.%s's partition %s.",
+					*sampleRate,
+					task.DBName,
+					task.TableName,
+					task.PartitionName,
+				))
+			} else {
+				sc.AppendNote(errors.Errorf(
+					"Analyze use auto adjusted sample rate %f for table %s.%s.",
+					*sampleRate,
+					task.DBName,
+					task.TableName,
+				))
+			}
 		}
 	}
 	e.analyzePB.ColReq = &tipb.AnalyzeColumnsReq{
@@ -2245,6 +2262,7 @@ func (b *executorBuilder) buildAnalyzeSamplingPushdown(task plannercore.AnalyzeC
 	return &analyzeTask{taskType: colTask, colExec: e, job: job}
 }
 
+// getAdjustedSampleRate calculates the sample rate from the table size. If the table size cannot be obtained, 0.001 is used as the default sample rate.
 func (b *executorBuilder) getAdjustedSampleRate(sctx sessionctx.Context, tid int64, tblInfo *model.TableInfo) float64 {
 	statsHandle := domain.GetDomain(sctx).StatsHandle()
 	defaultRate := 0.001
@@ -2257,17 +2275,47 @@ func (b *executorBuilder) getAdjustedSampleRate(sctx sessionctx.Context, tid int
 	} else {
 		statsTbl = statsHandle.GetPartitionStats(tblInfo, tid)
 	}
-	if statsTbl == nil {
+	approxiCount, hasPD := b.getApproximateTableCountFromPD(sctx, tid)
+	// If there's neither stats meta nor pd information, return the default rate.
+	if statsTbl == nil && !hasPD {
 		return defaultRate
 	}
-	// If the count in stats_meta is still 0, the table is not large, we scan all rows.
+	// If there's no information from the pd side and the count in stats_meta is still 0, we scan all rows.
+	// hasPD is tested first: once pd has answered, statsTbl may still be nil and must not be dereferenced.
+	if !hasPD && statsTbl.Count == 0 {
+		return 1
+	}
+	// Workaround for https://github.com/pingcap/tidb/issues/29216: if the count from stats_meta is missing, or
+	// more than 100 times smaller than the approximate count from pd, we assume that we have hit this issue
+	// and use the approximate count to calculate the sample rate.
+	if statsTbl == nil || float64(statsTbl.Count)*100 < approxiCount {
+		// TiKV confirmed an empirical error of about 20% for the approximate count, so we target 150000 rows
+		// to compensate. A zero approxiCount makes the quotient +Inf, which math.Min caps at 1.
+		return math.Min(1, 150000/approxiCount)
+	}
+	// statsTbl is non-nil from here on. If its count is still zero, return 1 to prevent dividing by zero.
 	if statsTbl.Count == 0 {
 		return 1
 	}
 	// We are expected to scan about 100000 rows or so.
+	// Since there's a tiny error rate around the count from the stats meta, we use 110000 to get a slightly larger result.
 	return math.Min(1, 110000/float64(statsTbl.Count))
 }
 
+func (b *executorBuilder) getApproximateTableCountFromPD(sctx sessionctx.Context, tid int64) (float64, bool) {
+	tikvStore, ok := sctx.GetStore().(helper.Storage)
+	if !ok {
+		return 0, false
+	}
+	regionStats := &helper.PDRegionStats{}
+	pdHelper := helper.NewHelper(tikvStore)
+	err := pdHelper.GetPDRegionStats(tid, regionStats)
+	if err != nil {
+		return 0, false
+	}
+	return float64(regionStats.StorageKeys), true
+}
+
 func (b *executorBuilder) buildAnalyzeColumnsPushdown(task plannercore.AnalyzeColumnsTask, opts map[ast.AnalyzeOptionType]uint64, autoAnalyze string, schemaForVirtualColEval *expression.Schema) *analyzeTask {
 	if task.StatsVersion == statistics.Version2 {
 		return b.buildAnalyzeSamplingPushdown(task, opts, autoAnalyze, schemaForVirtualColEval)
diff --git a/executor/infoschema_reader_test.go b/executor/infoschema_reader_test.go
index 5ec96107ca..d2b26a10dd 100644
--- a/executor/infoschema_reader_test.go
+++ b/executor/infoschema_reader_test.go
@@ -587,7 +587,7 @@ func (s *testInfoschemaTableSuite) TestForAnalyzeStatus(c *C) {
 	tk.MustExec("create table t1 (a int, b int, index idx(a))")
 	tk.MustExec("insert into t1 values (1,2),(3,4)")
 	tk.MustExec("analyze table t1")
-	tk.MustQuery("show warnings").Check(testkit.Rows()) // no warning
+	tk.MustQuery("show warnings").Check(testkit.Rows("Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t1.")) // 1 note.
 	c.Assert(s.dom.StatsHandle().LoadNeededHistograms(), IsNil)
 	tk.MustExec("CREATE ROLE r_t1 ;")
 	tk.MustExec("GRANT ALL PRIVILEGES ON test.t1 TO r_t1;")
diff --git a/planner/core/integration_test.go b/planner/core/integration_test.go
index c5ae612bbc..bb9d88756f 100644
--- a/planner/core/integration_test.go
+++ b/planner/core/integration_test.go
@@ -3997,9 +3997,10 @@ func (s *testIntegrationSuite) TestIncrementalAnalyzeStatsVer2(c *C) {
 	c.Assert(rows[0][0], Equals, "3")
 	tk.MustExec("insert into t values(4,4),(5,5),(6,6)")
 	tk.MustExec("analyze incremental table t index idx_b")
-	c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings(), HasLen, 2)
+	c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings(), HasLen, 3)
 	c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[0].Err.Error(), Equals, "The version 2 would collect all statistics not only the selected indexes")
 	c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[1].Err.Error(), Equals, "The version 2 stats would ignore the INCREMENTAL keyword and do full sampling")
+	c.Assert(tk.Se.GetSessionVars().StmtCtx.GetWarnings()[2].Err.Error(), Equals, "Analyze use auto adjusted sample rate 1.000000 for table test.t.")
 	rows = tk.MustQuery(fmt.Sprintf("select distinct_count from mysql.stats_histograms where table_id = %d and is_index = 1", tblID)).Rows()
 	c.Assert(len(rows), Equals, 1)
 	c.Assert(rows[0][0], Equals, "6")
diff --git a/statistics/handle/handle_test.go b/statistics/handle/handle_test.go
index c85f494232..78c6415eec 100644
--- a/statistics/handle/handle_test.go
+++ b/statistics/handle/handle_test.go
@@ -2177,6 +2177,7 @@ func (s *testStatsSuite) TestFMSWithAnalyzePartition(c *C) {
 	tk.MustQuery("select count(*) from mysql.stats_fm_sketch").Check(testkit.Rows("0"))
 	tk.MustExec("analyze table t partition p0 with 1 topn, 2 buckets")
 	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
 		"Warning 8131 Build table: `t` global-level stats failed due to missing partition-level stats",
 		"Warning 8131 Build table: `t` index: `a` global-level stats failed due to missing partition-level stats",
 	))
@@ -3093,7 +3094,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithPrimaryKey(c *C) {
 	tblID := tbl.Meta().ID
 
 	tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
+		"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
+	))
 	rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
 	c.Assert(len(rows), Equals, 2)
 	c.Assert(rows[0][3], Equals, "a")
@@ -3136,7 +3140,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithIndex(c *C) {
 	tblID := tbl.Meta().ID
 
 	tk.MustExec("analyze table t columns c with 2 topn, 2 buckets")
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
+		"Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
+	))
 	rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
 	c.Assert(len(rows), Equals, 3)
 	c.Assert(rows[0][3], Equals, "b")
@@ -3188,7 +3195,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithClusteredIndex(c *C) {
 	tblID := tbl.Meta().ID
 
 	tk.MustExec("analyze table t columns c with 2 topn, 2 buckets")
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
+		"Warning 1105 Columns b,d are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
+	))
 	rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
 	c.Assert(len(rows), Equals, 3)
 	c.Assert(rows[0][3], Equals, "b")
@@ -3263,7 +3273,11 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithDynamicPartitionTable(c *C) {
 	p1ID := defs[1].ID
 
 	tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p1.",
+		"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
+	))
 	rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
 	c.Assert(len(rows), Equals, 6)
 	c.Assert(rows[0][:4], DeepEquals, []interface{}{"test", "t", "global", "a"})
@@ -3361,7 +3375,11 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithStaticPartitionTable(c *C) {
 	p1ID := defs[1].ID
 
 	tk.MustExec("analyze table t columns a with 2 topn, 2 buckets")
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p0.",
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t's partition p1.",
+		"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
+	))
 	rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
 	c.Assert(len(rows), Equals, 4)
 	c.Assert(rows[0][:4], DeepEquals, []interface{}{"test", "t", "p0", "a"})
@@ -3440,7 +3458,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithExtendedStats(c *C) {
 	tblID := tbl.Meta().ID
 
 	tk.MustExec("analyze table t columns b with 2 topn, 2 buckets")
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
+		"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
+	))
 	rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Sort().Rows()
 	c.Assert(len(rows), Equals, 2)
 	c.Assert(rows[0][3], Equals, "b")
@@ -3486,7 +3507,10 @@ func (s *testStatsSuite) TestAnalyzeColumnsWithVirtualColumnIndex(c *C) {
 	tblID := tbl.Meta().ID
 
 	tk.MustExec("analyze table t columns b with 2 topn, 2 buckets")
-	tk.MustQuery("show warnings").Check(testkit.Rows("Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats."))
+	tk.MustQuery("show warnings").Sort().Check(testkit.Rows(
+		"Note 1105 Analyze use auto adjusted sample rate 1.000000 for table test.t.",
+		"Warning 1105 Columns c are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats.",
+	))
 	// virtual column c is skipped when dumping stats into disk, so only the stats of column b are updated
 	rows := tk.MustQuery("show column_stats_usage where db_name = 'test' and table_name = 't' and last_analyzed_at is not null").Rows()
 	c.Assert(len(rows), Equals, 1)