From d4a50aa47b6ba575aa0ef763caac5a72ef4fec79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BA=8C=E6=89=8B=E6=8E=89=E5=8C=85=E5=B7=A5=E7=A8=8B?=
 =?UTF-8?q?=E5=B8=88?= <rustin.liu@gmail.com>
Date: Mon, 14 Aug 2023 16:38:00 +0800
Subject: [PATCH] planner: refactor `getFullAnalyzeColumnsInfo` (#46044)

---
 executor/test/analyzetest/analyze_test.go |   4 +-
 planner/core/planbuilder.go               | 152 ++++++++++++++--------
 planner/core/planbuilder_test.go          |  22 +++-
 3 files changed, 117 insertions(+), 61 deletions(-)

diff --git a/executor/test/analyzetest/analyze_test.go b/executor/test/analyzetest/analyze_test.go
index 3f527be850..379f437bec 100644
--- a/executor/test/analyzetest/analyze_test.go
+++ b/executor/test/analyzetest/analyze_test.go
@@ -2126,9 +2126,9 @@ func TestAnalyzeColumnsErrorAndWarning(t *testing.T) {
 	// analyze version 1 doesn't support `ANALYZE COLUMNS c1, ..., cn`/`ANALYZE PREDICATE COLUMNS` currently
 	tk.MustExec("set @@tidb_analyze_version = 1")
 	err := tk.ExecToErr("analyze table t columns a")
-	require.Equal(t, "Only the analyze version 2 supports analyzing the specified columns", err.Error())
+	require.Equal(t, "Only the version 2 of analyze supports analyzing the specified columns", err.Error())
 	err = tk.ExecToErr("analyze table t predicate columns")
-	require.Equal(t, "Only the analyze version 2 supports analyzing predicate columns", err.Error())
+	require.Equal(t, "Only the version 2 of analyze supports analyzing predicate columns", err.Error())
 
 	tk.MustExec("set @@tidb_analyze_version = 2")
 	// invalid column
diff --git a/planner/core/planbuilder.go b/planner/core/planbuilder.go
index 50d1e30ba2..9b5e90e4ad 100644
--- a/planner/core/planbuilder.go
+++ b/planner/core/planbuilder.go
@@ -2325,15 +2325,7 @@ func (b *PlanBuilder) getFullAnalyzeColumnsInfo(
 	if mustAllColumns && warning && (columnChoice == model.PredicateColumns || columnChoice == model.ColumnList) {
 		b.ctx.GetSessionVars().StmtCtx.AppendWarning(errors.Errorf("Table %s.%s has version 1 statistics so all the columns must be analyzed to overwrite the current statistics", tbl.Schema.L, tbl.Name.L))
 	}
-	colSet2colList := func(colSet map[int64]struct{}) []*model.ColumnInfo {
-		colList := make([]*model.ColumnInfo, 0, len(colSet))
-		for _, colInfo := range tbl.TableInfo.Columns {
-			if _, ok := colSet[colInfo.ID]; ok {
-				colList = append(colList, colInfo)
-			}
-		}
-		return colList
-	}
+
 	switch columnChoice {
 	case model.DefaultChoice, model.AllColumns:
 		return tbl.TableInfo.Columns, nil, nil
@@ -2349,52 +2341,86 @@ func (b *PlanBuilder) getFullAnalyzeColumnsInfo(
 		if err != nil {
 			return nil, nil, err
 		}
-		colSet := make(map[int64]struct{}, len(predicate)+len(mustAnalyzed))
-		for colID := range predicate {
-			colSet[colID] = struct{}{}
-		}
-		for colID := range mustAnalyzed {
-			colSet[colID] = struct{}{}
-		}
-		return colSet2colList(colSet), nil, nil
+		colSet := combineColumnSets(predicate, mustAnalyzed)
+		return getColumnListFromSet(tbl.TableInfo.Columns, colSet), nil, nil
 	case model.ColumnList:
-		colSet := make(map[int64]struct{}, len(specifiedCols))
-		for _, colInfo := range specifiedCols {
-			colSet[colInfo.ID] = struct{}{}
-		}
+		colSet := getColumnSetFromSpecifiedCols(specifiedCols)
 		mustAnalyzed, err := b.getMustAnalyzedColumns(tbl, mustAnalyzedCols)
 		if err != nil {
 			return nil, nil, err
 		}
 		if warning {
-			missing := make(map[int64]struct{}, len(mustAnalyzed))
-			for colID := range mustAnalyzed {
-				if _, ok := colSet[colID]; !ok {
-					missing[colID] = struct{}{}
-				}
-			}
+			missing := getMissingColumns(colSet, mustAnalyzed)
 			if len(missing) > 0 {
-				missingNames := make([]string, 0, len(missing))
-				for _, col := range tbl.TableInfo.Columns {
-					if _, ok := missing[col.ID]; ok {
-						missingNames = append(missingNames, col.Name.O)
-					}
-				}
-				b.ctx.GetSessionVars().StmtCtx.AppendWarning(errors.Errorf("Columns %s are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats", strings.Join(missingNames, ",")))
+				missingNames := getColumnNamesFromIDs(tbl.TableInfo.Columns, missing)
+				warningMsg := fmt.Sprintf("Columns %s are missing in ANALYZE but their stats are needed for calculating stats for indexes/primary key/extended stats", strings.Join(missingNames, ","))
+				b.ctx.GetSessionVars().StmtCtx.AppendWarning(errors.New(warningMsg))
 			}
 		}
-		for colID := range mustAnalyzed {
-			colSet[colID] = struct{}{}
-		}
-		colList := colSet2colList(colSet)
+		colSet = combineColumnSets(colSet, mustAnalyzed)
+		colList := getColumnListFromSet(tbl.TableInfo.Columns, colSet)
 		if mustAllColumns {
 			return tbl.TableInfo.Columns, colList, nil
 		}
 		return colList, colList, nil
 	}
+
 	return nil, nil, nil
 }
 
+// combineColumnSets returns a new set holding every column ID found in any of the given sets.
+func combineColumnSets(sets ...map[int64]struct{}) map[int64]struct{} {
+	result := make(map[int64]struct{})
+	for _, set := range sets {
+		for colID := range set {
+			result[colID] = struct{}{}
+		}
+	}
+	return result
+}
+
+// getColumnSetFromSpecifiedCols collects the IDs of the given columns into a set.
+func getColumnSetFromSpecifiedCols(cols []*model.ColumnInfo) map[int64]struct{} {
+	colSet := make(map[int64]struct{}, len(cols))
+	for _, colInfo := range cols {
+		colSet[colInfo.ID] = struct{}{}
+	}
+	return colSet
+}
+
+// getMissingColumns returns the column IDs that are in mustAnalyzed but absent from colSet.
+func getMissingColumns(colSet, mustAnalyzed map[int64]struct{}) map[int64]struct{} {
+	missing := make(map[int64]struct{})
+	for colID := range mustAnalyzed {
+		if _, ok := colSet[colID]; !ok {
+			missing[colID] = struct{}{}
+		}
+	}
+	return missing
+}
+
+// getColumnNamesFromIDs returns Name.O of each entry of columns whose ID is in colIDs, in the order of columns.
+func getColumnNamesFromIDs(columns []*model.ColumnInfo, colIDs map[int64]struct{}) []string {
+	var missingNames []string
+	for _, col := range columns {
+		if _, ok := colIDs[col.ID]; ok {
+			missingNames = append(missingNames, col.Name.O)
+		}
+	}
+	return missingNames
+}
+
+// getColumnListFromSet returns the entries of columns whose IDs are in colSet, preserving the order of columns.
+func getColumnListFromSet(columns []*model.ColumnInfo, colSet map[int64]struct{}) []*model.ColumnInfo {
+	colList := make([]*model.ColumnInfo, 0, len(colSet))
+	for _, colInfo := range columns {
+		if _, ok := colSet[colInfo.ID]; ok {
+			colList = append(colList, colInfo)
+		}
+	}
+	return colList
+}
+
 func getColOffsetForAnalyze(colsInfo []*model.ColumnInfo, colID int64) int {
 	for i, col := range colsInfo {
 		if colID == col.ID {
@@ -2438,30 +2464,36 @@ func getModifiedIndexesInfoForAnalyze(sctx sessionctx.Context, tblInfo *model.Ta
 
 func (b *PlanBuilder) buildAnalyzeFullSamplingTask(
 	as *ast.AnalyzeTableStmt,
-	taskSlice []AnalyzeColumnsTask,
+	tasks []AnalyzeColumnsTask,
 	physicalIDs []int64,
-	names []string,
+	partitionNames []string,
 	tbl *ast.TableName,
 	version int,
 	persistOpts bool,
 	rsOptionsMap map[int64]V2AnalyzeOptions,
 ) ([]AnalyzeColumnsTask, error) {
+	// Version 2 doesn't support incremental analyze.
+	// And incremental analyze will be deprecated in the future.
 	if as.Incremental {
 		b.ctx.GetSessionVars().StmtCtx.AppendWarning(errors.Errorf("The version 2 stats would ignore the INCREMENTAL keyword and do full sampling"))
 	}
-	astOpts, err := parseAnalyzeOptionsV2(as.AnalyzeOpts)
+
+	astOpts, err := handleAnalyzeOptionsV2(as.AnalyzeOpts)
 	if err != nil {
 		return nil, err
 	}
+	// Get all column info which need to be analyzed.
 	astColList, err := getAnalyzeColumnList(as.ColumnNames, tbl)
 	if err != nil {
 		return nil, err
 	}
+
 	var predicateCols, mustAnalyzedCols calcOnceMap
 	ver := version
 	statsHandle := domain.GetDomain(b.ctx).StatsHandle()
 	// If the statistics of the table is version 1, we must analyze all columns to overwrites all of old statistics.
 	mustAllColumns := !statsHandle.CheckAnalyzeVersion(tbl.TableInfo, physicalIDs, &ver)
+
 	astColsInfo, _, err := b.getFullAnalyzeColumnsInfo(tbl, as.ColumnChoice, astColList, &predicateCols, &mustAnalyzedCols, mustAllColumns, true)
 	if err != nil {
 		return nil, err
@@ -2474,6 +2506,7 @@ func (b *PlanBuilder) buildAnalyzeFullSamplingTask(
 	for physicalID, opts := range optionsMap {
 		rsOptionsMap[physicalID] = opts
 	}
+
 	for i, id := range physicalIDs {
 		physicalID := id
 		if id == tbl.TableInfo.ID {
@@ -2482,7 +2515,7 @@ func (b *PlanBuilder) buildAnalyzeFullSamplingTask(
 		info := AnalyzeInfo{
 			DBName:        tbl.Schema.O,
 			TableName:     tbl.Name.O,
-			PartitionName: names[i],
+			PartitionName: partitionNames[i],
 			TableID:       statistics.AnalyzeTableID{TableID: tbl.TableInfo.ID, PartitionID: id},
 			Incremental:   false,
 			StatsVersion:  version,
@@ -2541,9 +2574,10 @@ func (b *PlanBuilder) buildAnalyzeFullSamplingTask(
 			newTask.ColsInfo = append(newTask.ColsInfo, extraCol)
 			newTask.HandleCols = &IntHandleCols{col: colInfoToColumn(extraCol, len(newTask.ColsInfo)-1)}
 		}
-		taskSlice = append(taskSlice, newTask)
+		tasks = append(tasks, newTask)
 	}
-	return taskSlice, nil
+
+	return tasks, nil
 }
 
 func (b *PlanBuilder) genV2AnalyzeOptions(
@@ -2708,10 +2742,13 @@ func mergeColumnList(choice1 model.ColumnChoice, list1 []*model.ColumnInfo, choi
 	return choice2, list2
 }
 
+// buildAnalyzeTable constructs analyze tasks for each table.
 func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt, opts map[ast.AnalyzeOptionType]uint64, version int) (Plan, error) {
 	p := &Analyze{Opts: opts}
 	p.OptionsMap = make(map[int64]V2AnalyzeOptions)
 	usePersistedOptions := variable.PersistAnalyzeOptions.Load()
+
+	// Construct tasks for each table.
 	for _, tbl := range as.TableNames {
 		if tbl.TableInfo.IsView() {
 			return nil, errors.Errorf("analyze view %s is not supported now", tbl.Name.O)
@@ -2719,15 +2756,16 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt, opts map[ast.A
 		if tbl.TableInfo.IsSequence() {
 			return nil, errors.Errorf("analyze sequence %s is not supported now", tbl.Name.O)
 		}
+
 		idxInfo, colInfo := getColsInfo(tbl)
-		physicalIDs, names, err := GetPhysicalIDsAndPartitionNames(tbl.TableInfo, as.PartitionNames)
+		physicalIDs, partitionNames, err := GetPhysicalIDsAndPartitionNames(tbl.TableInfo, as.PartitionNames)
 		if err != nil {
 			return nil, err
 		}
 		var commonHandleInfo *model.IndexInfo
 		// If we want to analyze this table with analyze version 2 but the existing stats is version 1 and stats feedback is enabled,
 		// we will switch back to analyze version 1.
-		if statistics.FeedbackProbability.Load() > 0 && version == 2 {
+		if statistics.FeedbackProbability.Load() > 0 && version == statistics.Version2 {
 			statsHandle := domain.GetDomain(b.ctx).StatsHandle()
 			versionIsSame := statsHandle.CheckAnalyzeVersion(tbl.TableInfo, physicalIDs, &version)
 			if !versionIsSame {
@@ -2736,18 +2774,21 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt, opts map[ast.A
 					"If you want to switch to version 2 statistics, please first disable query feedback by setting feedback-probability to 0.0 in the config file.", tbl.Name))
 			}
 		}
+
 		if version == statistics.Version2 {
-			p.ColTasks, err = b.buildAnalyzeFullSamplingTask(as, p.ColTasks, physicalIDs, names, tbl, version, usePersistedOptions, p.OptionsMap)
+			p.ColTasks, err = b.buildAnalyzeFullSamplingTask(as, p.ColTasks, physicalIDs, partitionNames, tbl, version, usePersistedOptions, p.OptionsMap)
 			if err != nil {
 				return nil, err
 			}
 			continue
 		}
+
+		// Version 1 analyze.
 		if as.ColumnChoice == model.PredicateColumns {
-			return nil, errors.Errorf("Only the analyze version 2 supports analyzing predicate columns")
+			return nil, errors.Errorf("Only the version 2 of analyze supports analyzing predicate columns")
 		}
 		if as.ColumnChoice == model.ColumnList {
-			return nil, errors.Errorf("Only the analyze version 2 supports analyzing the specified columns")
+			return nil, errors.Errorf("Only the version 2 of analyze supports analyzing the specified columns")
 		}
 		for _, idx := range idxInfo {
 			// For prefix common handle. We don't use analyze mixed to handle it with columns. Because the full value
@@ -2767,7 +2808,7 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt, opts map[ast.A
 				info := AnalyzeInfo{
 					DBName:        tbl.Schema.O,
 					TableName:     tbl.Name.O,
-					PartitionName: names[i],
+					PartitionName: partitionNames[i],
 					TableID:       statistics.AnalyzeTableID{TableID: tbl.TableInfo.ID, PartitionID: id},
 					Incremental:   as.Incremental,
 					StatsVersion:  version,
@@ -2788,7 +2829,7 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt, opts map[ast.A
 				info := AnalyzeInfo{
 					DBName:        tbl.Schema.O,
 					TableName:     tbl.Name.O,
-					PartitionName: names[i],
+					PartitionName: partitionNames[i],
 					TableID:       statistics.AnalyzeTableID{TableID: tbl.TableInfo.ID, PartitionID: id},
 					Incremental:   as.Incremental,
 					StatsVersion:  version,
@@ -2803,6 +2844,7 @@ func (b *PlanBuilder) buildAnalyzeTable(as *ast.AnalyzeTableStmt, opts map[ast.A
 			}
 		}
 	}
+
 	return p, nil
 }
 
@@ -2971,7 +3013,9 @@ var analyzeOptionDefaultV2 = map[ast.AnalyzeOptionType]uint64{
 	ast.AnalyzeOptSampleRate:    math.Float64bits(-1),
 }
 
-func parseAnalyzeOptionsV2(opts []ast.AnalyzeOpt) (map[ast.AnalyzeOptionType]uint64, error) {
+// handleAnalyzeOptionsV2 is very similar to handleAnalyzeOptions, but it's used for analyze version 2.
+// Remove this function after we remove the support of analyze version 1.
+func handleAnalyzeOptionsV2(opts []ast.AnalyzeOpt) (map[ast.AnalyzeOptionType]uint64, error) {
 	optMap := make(map[ast.AnalyzeOptionType]uint64, len(analyzeOptionDefault))
 	sampleNum, sampleRate := uint64(0), 0.0
 	for _, opt := range opts {
@@ -3009,6 +3053,7 @@ func parseAnalyzeOptionsV2(opts []ast.AnalyzeOpt) (map[ast.AnalyzeOptionType]uin
 	if sampleNum > 0 && sampleRate > 0 {
 		return nil, errors.Errorf("You can only either set the value of the sample num or set the value of the sample rate. Don't set both of them")
 	}
+
 	return optMap, nil
 }
 
@@ -3074,7 +3119,8 @@ func handleAnalyzeOptions(opts []ast.AnalyzeOpt, statsVer int) (map[ast.AnalyzeO
 	if sampleNum > 0 && sampleRate > 0 {
 		return nil, errors.Errorf("You can only either set the value of the sample num or set the value of the sample rate. Don't set both of them")
 	}
-	if optMap[ast.AnalyzeOptCMSketchWidth]*optMap[ast.AnalyzeOptCMSketchDepth] > CMSketchSizeLimit {
+	// Only version 1 has cmsketch.
+	if statsVer == statistics.Version1 && optMap[ast.AnalyzeOptCMSketchWidth]*optMap[ast.AnalyzeOptCMSketchDepth] > CMSketchSizeLimit {
 		return nil, errors.Errorf("cm sketch size(depth * width) should not larger than %d", CMSketchSizeLimit)
 	}
 	return optMap, nil
diff --git a/planner/core/planbuilder_test.go b/planner/core/planbuilder_test.go
index 8608f50eaf..cf18bb5e27 100644
--- a/planner/core/planbuilder_test.go
+++ b/planner/core/planbuilder_test.go
@@ -542,7 +542,7 @@ func checkDeepClonedCore(v1, v2 reflect.Value, path string, whiteList []string,
 	return nil
 }
 
-func TestHandleAnalyzeOptions(t *testing.T) {
+func TestHandleAnalyzeOptionsV1AndV2(t *testing.T) {
 	require.Equal(t, len(analyzeOptionDefault), len(analyzeOptionDefaultV2), "analyzeOptionDefault and analyzeOptionDefaultV2 should have the same length")
 
 	tests := []struct {
@@ -559,7 +559,7 @@ func TestHandleAnalyzeOptions(t *testing.T) {
 					Value: ast.NewValueExpr(16384+1, "", ""),
 				},
 			},
-			statsVer:    1,
+			statsVer:    statistics.Version1,
 			ExpectedErr: "Value of analyze option TOPN should not be larger than 16384",
 		},
 		{
@@ -570,7 +570,7 @@ func TestHandleAnalyzeOptions(t *testing.T) {
 					Value: ast.NewValueExpr(1, "", ""),
 				},
 			},
-			statsVer:    1,
+			statsVer:    statistics.Version1,
 			ExpectedErr: "Version 1's statistics doesn't support the SAMPLERATE option, please set tidb_analyze_version to 2",
 		},
 		{
@@ -581,7 +581,7 @@ func TestHandleAnalyzeOptions(t *testing.T) {
 					Value: ast.NewValueExpr(2, "", ""),
 				},
 			},
-			statsVer:    2,
+			statsVer:    statistics.Version2,
 			ExpectedErr: "Value of analyze option SAMPLERATE should not larger than 1.000000, and should be greater than 0",
 		},
 		{
@@ -607,7 +607,7 @@ func TestHandleAnalyzeOptions(t *testing.T) {
 					Value: ast.NewValueExpr(0.1, "", ""),
 				},
 			},
-			statsVer:    2,
+			statsVer:    statistics.Version2,
 			ExpectedErr: "ou can only either set the value of the sample num or set the value of the sample rate. Don't set both of them",
 		},
 		{
@@ -622,7 +622,7 @@ func TestHandleAnalyzeOptions(t *testing.T) {
 					Value: ast.NewValueExpr(2048, "", ""),
 				},
 			},
-			statsVer:    2,
+			statsVer:    statistics.Version1,
 			ExpectedErr: "cm sketch size(depth * width) should not larger than 1258291",
 		},
 	}
@@ -636,6 +636,16 @@ func TestHandleAnalyzeOptions(t *testing.T) {
 			} else {
 				require.NoError(t, err)
 			}
+
+			if tt.statsVer == statistics.Version2 {
+				_, err := handleAnalyzeOptionsV2(tt.opts)
+				if tt.ExpectedErr != "" {
+					require.Error(t, err)
+					require.Contains(t, err.Error(), tt.ExpectedErr)
+				} else {
+					require.NoError(t, err)
+				}
+			}
 		})
 	}
 }