planner: create variables for recognizing over and underestimation risk (#62910)

ref pingcap/tidb#59333
This commit is contained in:
Isabel Chen
2025-08-09 10:01:45 -07:00
committed by GitHub
parent 04590cf4a5
commit 1a88fd99db
12 changed files with 143 additions and 127 deletions

View File

@ -170,7 +170,7 @@ func crossEstimateRowCount(sctx planctx.PlanContext,
if idxExists && len(idxIDs) > 0 {
idxID = idxIDs[0]
}
rangeCounts, _, ok := getColumnRangeCounts(sctx, colUniqueID, ranges, dsTableStats.HistColl, idxID)
rangeCounts, _, _, ok := getColumnRangeCounts(sctx, colUniqueID, ranges, dsTableStats.HistColl, idxID)
if !ok {
return 0, false, corr
}
@ -180,7 +180,7 @@ func crossEstimateRowCount(sctx planctx.PlanContext,
}
var rangeCount float64
if idxExists {
rangeCount, _, err = GetRowCountByIndexRanges(sctx, dsTableStats.HistColl, idxID, convertedRanges, nil)
rangeCount, _, _, err = GetRowCountByIndexRanges(sctx, dsTableStats.HistColl, idxID, convertedRanges, nil)
} else {
rangeCount, err = GetRowCountByColumnRanges(sctx, dsTableStats.HistColl, colUniqueID, convertedRanges)
}
@ -196,30 +196,30 @@ func crossEstimateRowCount(sctx planctx.PlanContext,
}
// getColumnRangeCounts estimates row count for each range respectively.
func getColumnRangeCounts(sctx planctx.PlanContext, colID int64, ranges []*ranger.Range, histColl *statistics.HistColl, idxID int64) ([]float64, float64, bool) {
func getColumnRangeCounts(sctx planctx.PlanContext, colID int64, ranges []*ranger.Range, histColl *statistics.HistColl, idxID int64) (rangeCounts []float64, minCount float64, maxCount float64, ok bool) {
var err error
var count, corrCount float64
rangeCounts := make([]float64, len(ranges))
var count float64
rangeCounts = make([]float64, len(ranges))
for i, ran := range ranges {
if idxID >= 0 {
idxHist := histColl.GetIdx(idxID)
if statistics.IndexStatsIsInvalid(sctx, idxHist, histColl, idxID) {
return nil, 0, false
return nil, 0, 0, false
}
count, corrCount, err = GetRowCountByIndexRanges(sctx, histColl, idxID, []*ranger.Range{ran}, nil)
count, minCount, maxCount, err = GetRowCountByIndexRanges(sctx, histColl, idxID, []*ranger.Range{ran}, nil)
} else {
colHist := histColl.GetCol(colID)
if statistics.ColumnStatsIsInvalid(colHist, sctx, histColl, colID) {
return nil, 0, false
return nil, 0, 0, false
}
count, err = GetRowCountByColumnRanges(sctx, histColl, colID, []*ranger.Range{ran})
}
if err != nil {
return nil, 0, false
return nil, 0, 0, false
}
rangeCounts[i] = count
}
return rangeCounts, corrCount, true
return rangeCounts, minCount, maxCount, true
}
// convertRangeFromExpectedCnt builds new ranges used to estimate row count we need to scan in table scan before finding specified

View File

@ -412,7 +412,7 @@ func ColumnEqualRowCount(sctx planctx.PlanContext, t *statistics.Table, value ty
// getPseudoRowCountWithPartialStats calculates the row count if there are no statistics on the index, but there are column stats available.
func getPseudoRowCountWithPartialStats(sctx planctx.PlanContext, coll *statistics.HistColl, indexRanges []*ranger.Range,
tableRowCount float64, idxCols []*expression.Column) (totalCount float64, corrCount float64, err error) {
tableRowCount float64, idxCols []*expression.Column) (totalCount float64, maxCount float64, err error) {
if tableRowCount == 0 {
return 0, 0, nil
}
@ -433,7 +433,7 @@ func getPseudoRowCountWithPartialStats(sctx planctx.PlanContext, coll *statistic
colID int64
)
totalCount = float64(0)
corrCount = float64(0)
maxCount = float64(0)
for _, indexRange := range indexRanges {
selectivity := float64(1.0)
corrSelectivity := float64(1.0)
@ -456,8 +456,8 @@ func getPseudoRowCountWithPartialStats(sctx planctx.PlanContext, coll *statistic
corrSelectivity = min(corrSelectivity, tempSelectivity)
}
totalCount += selectivity * tableRowCount
corrCount += corrSelectivity * tableRowCount
maxCount += corrSelectivity * tableRowCount
}
totalCount = mathutil.Clamp(totalCount, 1, tableRowCount)
return totalCount, corrCount, nil
return totalCount, maxCount, nil
}

View File

@ -41,7 +41,7 @@ import (
// GetRowCountByIndexRanges estimates the row count by a slice of Range.
// idxCols is used when index statistics are invalid, because coll may not have index info; it can be nil whenever index statistics are valid.
func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range, idxCols []*expression.Column) (result float64, corrResult float64, err error) {
func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range, idxCols []*expression.Column) (result float64, minResult float64, maxResult float64, err error) {
var name string
if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
debugtrace.EnterContextCommon(sctx)
@ -63,10 +63,9 @@ func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistCol
}
}
recordUsedItemStatsStatus(sctx, idx, coll.PhysicalID, idxID)
corrResult = float64(0)
if statistics.IndexStatsIsInvalid(sctx, idx, coll, idxID) {
if hasColumnStats(sctx, coll, idxCols) {
result, corrResult, err = getPseudoRowCountWithPartialStats(sctx, coll, indexRanges, float64(coll.RealtimeCount), idxCols)
result, maxResult, err = getPseudoRowCountWithPartialStats(sctx, coll, indexRanges, float64(coll.RealtimeCount), idxCols)
} else {
colsLen := -1
if idx != nil && idx.Info.Unique {
@ -77,7 +76,7 @@ func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistCol
ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats-Pseudo", uint64(result))
}
}
return result, corrResult, err
return result, minResult, maxResult, err
}
realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx)
if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
@ -90,12 +89,12 @@ func GetRowCountByIndexRanges(sctx planctx.PlanContext, coll *statistics.HistCol
if idx.CMSketch != nil && idx.StatsVer == statistics.Version1 {
result, err = getIndexRowCountForStatsV1(sctx, coll, idxID, indexRanges)
} else {
result, corrResult, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount)
result, minResult, maxResult, err = getIndexRowCountForStatsV2(sctx, idx, coll, indexRanges, realtimeCnt, modifyCount)
}
if sc.EnableOptimizerCETrace {
ceTraceRange(sctx, coll.PhysicalID, colNames, indexRanges, "Index Stats", uint64(result))
}
return result, corrResult, errors.Trace(err)
return result, minResult, maxResult, errors.Trace(err)
}
func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistColl, idxID int64, indexRanges []*ranger.Range) (float64, error) {
@ -125,7 +124,7 @@ func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistC
// values in this case.
if rangePosition == 0 || isSingleColIdxNullRange(idx, ran) {
realtimeCnt, modifyCount := coll.GetScaledRealtimeAndModifyCnt(idx)
count, _, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount)
count, _, _, err := getIndexRowCountForStatsV2(sctx, idx, nil, []*ranger.Range{ran}, realtimeCnt, modifyCount)
if err != nil {
return 0, errors.Trace(err)
}
@ -189,7 +188,7 @@ func getIndexRowCountForStatsV1(sctx planctx.PlanContext, coll *statistics.HistC
// prefer index stats over column stats
if idxIDs, ok := coll.ColUniqueID2IdxIDs[colUniqueID]; ok && len(idxIDs) > 0 {
idxID := idxIDs[0]
count, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}, nil)
count, _, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, []*ranger.Range{&rang}, nil)
} else {
count, err = GetRowCountByColumnRanges(sctx, coll, colUniqueID, []*ranger.Range{&rang})
}
@ -223,7 +222,7 @@ func isSingleColIdxNullRange(idx *statistics.Index, ran *ranger.Range) bool {
}
// It uses the modifyCount to validate, and realtimeRowCount to adjust the influence of modifications on the table.
func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (totalCount float64, corrCount float64, err error) {
func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRanges []*ranger.Range, realtimeRowCount, modifyCount int64) (totalCount, minCount, maxCount float64, err error) {
sc := sctx.GetSessionVars().StmtCtx
debugTrace := sc.EnableOptimizerDebugTrace
if debugTrace {
@ -237,12 +236,12 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index,
lb, err = codec.EncodeKey(sc.TimeZone(), nil, indexRange.LowVal...)
err = sc.HandleError(err)
if err != nil {
return 0, 0, err
return 0, 0, 0, err
}
rb, err = codec.EncodeKey(sc.TimeZone(), nil, indexRange.HighVal...)
err = sc.HandleError(err)
if err != nil {
return 0, 0, err
return 0, 0, 0, err
}
if debugTrace {
debugTraceStartEstimateRange(sctx, indexRange, lb, rb, totalCount)
@ -301,14 +300,15 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index,
// Due to the limitation of calcFraction and convertDatumToScalar, the histogram actually won't estimate anything.
// If the first column's range is point.
if rangePosition := getOrdinalOfRangeCond(sc, indexRange); rangePosition > 0 && idx.StatsVer >= statistics.Version2 && coll != nil {
var expBackoffSel, corrSel float64
expBackoffSel, corrSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange)
var expBackoffSel, minSel, maxSel float64
expBackoffSel, minSel, maxSel, expBackoffSuccess, err = expBackoffEstimation(sctx, idx, coll, indexRange)
if err != nil {
return 0, 0, err
return 0, 0, 0, err
}
if expBackoffSuccess {
expBackoffCnt := expBackoffSel * idx.TotalRowCount()
corrCnt := corrSel * idx.TotalRowCount()
minCnt := minSel * idx.TotalRowCount()
maxCnt := maxSel * idx.TotalRowCount()
upperLimit := expBackoffCnt
// Use the multi-column stats to calculate the max possible row count of [l, r)
@ -335,7 +335,8 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index,
expBackoffCnt = upperLimit
}
count += expBackoffCnt
corrCount += corrCnt
minCount += minCnt
maxCount += maxCnt
}
}
if !expBackoffSuccess {
@ -345,7 +346,8 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index,
// If the current table row count has changed, we should scale the row count accordingly.
increaseFactor := idx.GetIncreaseFactor(realtimeRowCount)
count *= increaseFactor
corrCount *= increaseFactor
minCount *= increaseFactor
maxCount *= increaseFactor
// handling the out-of-range part
if (outOfRangeOnIndex(idx, l) && !(isSingleColIdx && lowIsNull)) || outOfRangeOnIndex(idx, r) {
@ -387,7 +389,7 @@ func getIndexRowCountForStatsV2(sctx planctx.PlanContext, idx *statistics.Index,
// Don't allow the final result to go below 1 row
totalCount = mathutil.Clamp(totalCount, 1, float64(realtimeRowCount))
}
return totalCount, corrCount, nil
return totalCount, minCount, maxCount, nil
}
var nullKeyBytes, _ = codec.EncodeKey(time.UTC, nil, types.NewDatum(nil))
@ -476,7 +478,7 @@ func equalRowCountOnIndex(sctx planctx.PlanContext, idx *statistics.Index, b []b
}
// expBackoffEstimation estimates the multi-col cases following the Exponential Backoff. See comment below for details.
func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, corrSel float64, success bool, err error) {
func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll *statistics.HistColl, indexRange *ranger.Range) (sel float64, minSel float64, maxSel float64, success bool, err error) {
if sctx.GetSessionVars().StmtCtx.EnableOptimizerDebugTrace {
debugtrace.EnterContextCommon(sctx)
defer func() {
@ -497,6 +499,7 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll
}
colsIDs := coll.Idx2ColUniqueIDs[idx.Histogram.ID]
singleColumnEstResults := make([]float64, 0, len(indexRange.LowVal))
minSel = float64(1)
// The following code uses Exponential Backoff to reduce the impact of the independence assumption. It works like:
// 1. Calc the selectivity of each column.
// 2. Sort them and choose the first 4 most selective filter and the corresponding selectivity is sel_1, sel_2, sel_3, sel_4 where i < j => sel_i < sel_j.
@ -532,7 +535,7 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll
continue
}
foundStats = true
count, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan, nil)
count, _, _, err = GetRowCountByIndexRanges(sctx, coll, idxID, tmpRan, nil)
if err == nil {
break
}
@ -544,9 +547,10 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll
continue
}
if err != nil {
return 0, 0, false, err
return 0, 0, 0, false, err
}
singleColumnEstResults = append(singleColumnEstResults, selectivity)
minSel *= selectivity
}
// Sort them.
slices.Sort(singleColumnEstResults)
@ -556,9 +560,9 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll
l = 0
})
if l == 1 {
return singleColumnEstResults[0], singleColumnEstResults[0], true, nil
return singleColumnEstResults[0], singleColumnEstResults[0], singleColumnEstResults[0], true, nil
} else if l == 0 {
return 0, 0, false, nil
return 0, 0, 0, false, nil
}
// Do not allow the exponential backoff to go below the available index bound. If the number of predicates
// is less than the number of index columns - use 90% of the bound to differentiate a subset from full index match.
@ -571,21 +575,23 @@ func expBackoffEstimation(sctx planctx.PlanContext, idx *statistics.Index, coll
if l < len(idx.Info.Columns) {
idxLowBound /= 0.9
}
// corrSel is the selectivity of the most filtering column
corrSel = max(idxLowBound, singleColumnEstResults[0])
// maxSel assumes correlation, so it is the selectivity of the most filtering column (bounded below by idxLowBound).
maxSel = max(idxLowBound, singleColumnEstResults[0])
// minSel assumes independence between columns, so it is the product of all single column selectivities (bounded below by idxLowBound).
minSel = max(idxLowBound, minSel)
minTwoCol := min(singleColumnEstResults[0], singleColumnEstResults[1], idxLowBound)
multTwoCol := singleColumnEstResults[0] * math.Sqrt(singleColumnEstResults[1])
if l == 2 {
return max(minTwoCol, multTwoCol), corrSel, true, nil
return max(minTwoCol, multTwoCol), minSel, maxSel, true, nil
}
minThreeCol := min(minTwoCol, singleColumnEstResults[2])
multThreeCol := multTwoCol * math.Sqrt(math.Sqrt(singleColumnEstResults[2]))
if l == 3 {
return max(minThreeCol, multThreeCol), corrSel, true, nil
return max(minThreeCol, multThreeCol), minSel, maxSel, true, nil
}
minFourCol := min(minThreeCol, singleColumnEstResults[3])
multFourCol := multThreeCol * math.Sqrt(math.Sqrt(math.Sqrt(singleColumnEstResults[3])))
return max(minFourCol, multFourCol), corrSel, true, nil
return max(minFourCol, multFourCol), minSel, maxSel, true, nil
}
// outOfRangeOnIndex checks if the datum is out of the range.

View File

@ -203,12 +203,13 @@ func Selectivity(
if err != nil {
return 0, nil, errors.Trace(err)
}
cnt, corrCnt, err := GetRowCountByIndexRanges(ctx, coll, id, ranges, nil)
cnt, minCnt, maxCnt, err := GetRowCountByIndexRanges(ctx, coll, id, ranges, nil)
if err != nil {
return 0, nil, errors.Trace(err)
}
selectivity := cnt / float64(coll.RealtimeCount)
corrSelectivity := corrCnt / float64(coll.RealtimeCount)
minSelectivity := minCnt / float64(coll.RealtimeCount)
maxSelectivity := maxCnt / float64(coll.RealtimeCount)
nodes = append(nodes, &StatsNode{
Tp: IndexType,
ID: id,
@ -216,7 +217,8 @@ func Selectivity(
Ranges: ranges,
numCols: len(idxStats.Info.Columns),
Selectivity: selectivity,
CorrSelectivity: corrSelectivity,
MinSelectivity: minSelectivity,
MaxSelectivity: maxSelectivity,
partCover: partCover,
minAccessCondsForDNFCond: minAccessCondsForDNFCond,
})
@ -553,10 +555,12 @@ type StatsNode struct {
mask int64
// Selectivity indicates the Selectivity of this column/index.
Selectivity float64
// CorrSelectivity indicates the Selectivity of this column/index with correlated column.
// That is - it is the selectivity assuming the most filtering index column only, and all other
// columns are correlated with this column.
CorrSelectivity float64
// MinSelectivity indicates the Selectivity of this column/index for the least rows that can qualify.
// It takes into account situations that would decrease the row count, such as fully independent columns.
MinSelectivity float64
// MaxSelectivity indicates the Selectivity of this column/index for the most rows that can qualify.
// It takes into account situations that would increase the row count, such as correlated columns.
MaxSelectivity float64
// numCols is the number of columns contained in the index or column(which is always 1).
numCols int
// partCover indicates whether the bit in the mask is for a full cover or partial cover. It is only true

View File

@ -253,11 +253,11 @@ func TestEstimationForUnknownValues(t *testing.T) {
require.Equal(t, 4.7, count)
idxID := table.Meta().Indices[0].ID
count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(30, 30), nil)
count, _, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(30, 30), nil)
require.NoError(t, err)
require.Equal(t, 0.1, count)
count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(9, 30), nil)
count, _, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(9, 30), nil)
require.NoError(t, err)
require.Equal(t, 4.5, count)
@ -287,7 +287,7 @@ func TestEstimationForUnknownValues(t *testing.T) {
require.Equal(t, 1.0, count)
idxID = table.Meta().Indices[0].ID
count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(2, 2), nil)
count, _, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(2, 2), nil)
require.NoError(t, err)
require.Equal(t, 0.0, count)
}
@ -440,11 +440,11 @@ func TestEstimationUniqueKeyEqualConds(t *testing.T) {
sctx := mock.NewContext()
idxID := table.Meta().Indices[0].ID
count, _, err := cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(7, 7), nil)
count, _, _, err := cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(7, 7), nil)
require.NoError(t, err)
require.Equal(t, 1.0, count)
count, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count, _, _, err = cardinality.GetRowCountByIndexRanges(sctx, &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err)
require.Equal(t, 1.0, count)
@ -1075,12 +1075,12 @@ func TestIssue39593(t *testing.T) {
sctx := testKit.Session()
idxID := tblInfo.Indices[0].ID
vals := []int64{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20}
count, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals), nil)
count, _, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals), nil)
require.NoError(t, err)
// estimated row count without any changes, use range to reduce test flakiness
require.InDelta(t, float64(462.6), count, float64(1))
statsTbl.RealtimeCount *= 10
count, _, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals), nil)
count, _, _, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRanges(vals, vals), nil)
require.NoError(t, err)
// estimated row count after mock modify on the table, use range to reduce test flakiness
require.InDelta(t, float64(3702.6), count, float64(1))
@ -1569,15 +1569,15 @@ func TestRiskEqSkewRatio(t *testing.T) {
// Search for the value "6" which will not be found in the histogram buckets, and since
// there are NO topN values - the value will be considered skewed based upon skew ratio.
testKit.MustExec("set @@session.tidb_opt_risk_eq_skew_ratio = 0")
count, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count, _, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err)
testKit.MustExec("set @@session.tidb_opt_risk_eq_skew_ratio = 0.5")
count2, _, err2 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count2, _, _, err2 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err2)
// Result of count2 should be larger than count because the risk ratio is higher
require.Less(t, count, count2)
testKit.MustExec("set @@session.tidb_opt_risk_eq_skew_ratio = 1")
count3, _, err3 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count3, _, _, err3 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err3)
// Result of count3 should be larger because the risk ratio is higher
require.Less(t, count2, count3)
@ -1589,27 +1589,27 @@ func TestRiskEqSkewRatio(t *testing.T) {
require.NoError(t, h.DumpStatsDeltaToKV(true))
// Rerun tests with 1 value in the TopN
statsTbl = h.GetTableStats(tb.Meta())
count, _, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count, _, _, err = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err)
testKit.MustExec("set @@session.tidb_opt_risk_eq_skew_ratio = 0.5")
count2, _, err2 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count2, _, _, err2 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err2)
// Result of count2 should be larger than count because the risk ratio is higher
require.Less(t, count, count2)
testKit.MustExec("set @@session.tidb_opt_risk_eq_skew_ratio = 1")
count3, _, err3 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count3, _, _, err3 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err3)
// Result of count3 should be larger than count2 because the risk ratio is higher
require.Less(t, count2, count3)
// Repeat the prior test by setting the global variable instead of the session variable. This should have no effect.
testKit.MustExec("set @@global.tidb_opt_risk_eq_skew_ratio = 0.5")
count4, _, err4 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count4, _, _, err4 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err4)
require.Less(t, count2, count4)
// Repeat the prior test by setting the session variable to the default. Count4 should inherit the global
// variable and be less than count3.
testKit.MustExec("set @@session.tidb_opt_risk_eq_skew_ratio = default")
count4, _, err4 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
count4, _, _, err4 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(6, 6), nil)
require.NoError(t, err4)
require.Less(t, count4, count3)
// Reset global variable to default.
@ -1641,27 +1641,27 @@ func TestRiskRangeSkewRatioWithinBucket(t *testing.T) {
// Search for the range from 2 to 3, since there is only one bucket it will be a query within
// a bucket.
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = 0")
count, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
count, _, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
require.NoError(t, err)
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = 0.5")
count2, _, err2 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
count2, _, _, err2 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
require.NoError(t, err2)
// Result of count2 should be larger than count because the risk ratio is higher
require.Less(t, count, count2)
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = 1")
count3, _, err3 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
count3, _, _, err3 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
require.NoError(t, err3)
// Result of count3 should be larger because the risk ratio is higher
require.Less(t, count2, count3)
// Repeat the prior test by setting the global variable instead of the session variable. This should have no effect.
testKit.MustExec("set @@global.tidb_opt_risk_range_skew_ratio = 0.5")
count4, _, err4 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
count4, _, _, err4 := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
require.NoError(t, err4)
require.Less(t, count2, count4)
// Repeat the prior test by setting the session variable to the default. Count4 should inherit the global
// variable and be less than count3.
testKit.MustExec("set @@session.tidb_opt_risk_range_skew_ratio = default")
count4, _, err4 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
count4, _, _, err4 = cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, idxID, getRange(2, 3), nil)
require.NoError(t, err4)
require.Less(t, count4, count3)
// Reset global variable to default.
@ -1800,11 +1800,11 @@ func TestLastBucketEndValueHeuristic(t *testing.T) {
// Test index estimation as well
idx := statsTbl.GetIdx(table.Meta().Indices[0].ID)
if idx != nil {
idxEnhancedCount, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, table.Meta().Indices[0].ID, getRange(11, 11), nil)
idxEnhancedCount, _, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, table.Meta().Indices[0].ID, getRange(11, 11), nil)
require.NoError(t, err)
require.InDelta(t, 100.09, idxEnhancedCount, 0.1, "Index enhanced count should be approximately 100.09")
idxOtherCount, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, table.Meta().Indices[0].ID, getRange(3, 3), nil)
idxOtherCount, _, _, err := cardinality.GetRowCountByIndexRanges(sctx.GetPlanCtx(), &statsTbl.HistColl, table.Meta().Indices[0].ID, getRange(3, 3), nil)
require.NoError(t, err)
require.InDelta(t, 109.99, idxOtherCount, 0.1, "Index other count should be approximately 109.99")
}

View File

@ -227,14 +227,15 @@ func stabilizeGetStatsTblInfo(info *getStatsTblInfo) {
*/
type accessPathForDebugTrace struct {
IndexName string `json:",omitempty"`
AccessConditions []string
IndexFilters []string
TableFilters []string
PartialPaths []accessPathForDebugTrace `json:",omitempty"`
CountAfterAccess float64
CorrCountAfterAccess float64
CountAfterIndex float64
IndexName string `json:",omitempty"`
AccessConditions []string
IndexFilters []string
TableFilters []string
PartialPaths []accessPathForDebugTrace `json:",omitempty"`
CountAfterAccess float64
MinCountAfterAccess float64
MaxCountAfterAccess float64
CountAfterIndex float64
}
func convertAccessPathForDebugTrace(ctx expression.EvalContext, path *util.AccessPath, out *accessPathForDebugTrace) {
@ -245,7 +246,8 @@ func convertAccessPathForDebugTrace(ctx expression.EvalContext, path *util.Acces
out.IndexFilters = expression.ExprsToStringsForDisplay(ctx, path.IndexFilters)
out.TableFilters = expression.ExprsToStringsForDisplay(ctx, path.TableFilters)
out.CountAfterAccess = path.CountAfterAccess
out.CorrCountAfterAccess = path.CorrCountAfterAccess
out.MaxCountAfterAccess = path.MaxCountAfterAccess
out.MinCountAfterAccess = path.MinCountAfterAccess
out.CountAfterIndex = path.CountAfterIndex
out.PartialPaths = make([]accessPathForDebugTrace, len(path.PartialIndexPaths))
for i, partialPath := range path.PartialIndexPaths {

View File

@ -1513,11 +1513,12 @@ func constructDS2IndexScanTask(
rowCount = math.Min(rowCount, 1.0)
}
tmpPath := &util.AccessPath{
IndexFilters: indexConds,
TableFilters: tblConds,
CountAfterIndex: rowCount,
CountAfterAccess: rowCount,
CorrCountAfterAccess: 0,
IndexFilters: indexConds,
TableFilters: tblConds,
CountAfterIndex: rowCount,
CountAfterAccess: rowCount,
MinCountAfterAccess: 0,
MaxCountAfterAccess: 0,
}
// Assume equal conditions used by index join and other conditions are independent.
if len(tblConds) > 0 {

View File

@ -1084,24 +1084,22 @@ func compareGlobalIndex(lhs, rhs *candidatePath) int {
return compareBool(lhs.path.Index.Global, rhs.path.Index.Global)
}
func compareCorrRatio(lhs, rhs *candidatePath) (int, float64) {
lhsCorrRatio, rhsCorrRatio := 0.0, 0.0
// CorrCountAfterAccess tracks the "CountAfterAccess" only including the most selective index column, thus
// lhs/rhsCorrRatio represents the "risk" of the CountAfterAccess value - lower value means less risk that
// we do NOT know about actual correlation between indexed columns
// TODO - corrCountAfterAccess is only currently used to compete 2 indexes - since they are the only paths
// that potentially go through expBackOffEstimation
if lhs.path.CorrCountAfterAccess > 0 && rhs.path.CorrCountAfterAccess > 0 {
lhsCorrRatio = lhs.path.CorrCountAfterAccess / lhs.path.CountAfterAccess
rhsCorrRatio = rhs.path.CorrCountAfterAccess / rhs.path.CountAfterAccess
func compareRiskRatio(lhs, rhs *candidatePath) (int, float64) {
lhsRiskRatio, rhsRiskRatio := 0.0, 0.0
// MaxCountAfterAccess tracks the worst case "CountAfterAccess", accounting for scenarios that could
// increase our row estimation, thus lhs/rhsRiskRatio represents the "risk" of the CountAfterAccess value.
// Lower value means less risk that the actual row count is higher than the estimated one.
if lhs.path.MaxCountAfterAccess > 0 && rhs.path.MaxCountAfterAccess > 0 {
lhsRiskRatio = lhs.path.MaxCountAfterAccess / lhs.path.CountAfterAccess
rhsRiskRatio = rhs.path.MaxCountAfterAccess / rhs.path.CountAfterAccess
}
// lhs has lower risk
if lhsCorrRatio < rhsCorrRatio && lhs.path.CountAfterAccess < rhs.path.CountAfterAccess {
return 1, lhsCorrRatio
if lhsRiskRatio < rhsRiskRatio && lhs.path.CountAfterAccess < rhs.path.CountAfterAccess {
return 1, lhsRiskRatio
}
// rhs has lower risk
if rhsCorrRatio < lhsCorrRatio && rhs.path.CountAfterAccess < lhs.path.CountAfterAccess {
return -1, rhsCorrRatio
if rhsRiskRatio < lhsRiskRatio && rhs.path.CountAfterAccess < lhs.path.CountAfterAccess {
return -1, rhsRiskRatio
}
return 0, 0
}
@ -1150,8 +1148,8 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI
matchResult, globalResult := compareBool(lhs.isMatchProp, rhs.isMatchProp), compareGlobalIndex(lhs, rhs)
accessResult, comparable1 := util.CompareCol2Len(lhs.accessCondsColMap, rhs.accessCondsColMap)
scanResult, comparable2 := compareIndexBack(lhs, rhs)
// TODO: corrResult is not added to sum to limit change to existing logic. Further testing required.
corrResult, _ := compareCorrRatio(lhs, rhs)
// TODO: riskResult is not added to sum to limit change to existing logic. Further testing required.
riskResult, _ := compareRiskRatio(lhs, rhs)
sum := accessResult + scanResult + matchResult + globalResult
// First rules apply when an index doesn't have statistics and another object (index or table) has statistics
@ -1159,11 +1157,13 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI
// If one index has statistics and the other does not, choose the index with statistics if it
// has the same or higher number of equal/IN predicates.
if !lhsPseudo && globalResult >= 0 && sum >= 0 &&
lhs.path.EqOrInCondCount > 0 && lhs.path.EqOrInCondCount >= rhs.path.EqOrInCondCount {
lhs.path.EqOrInCondCount > 0 && lhs.path.EqOrInCondCount >= rhs.path.EqOrInCondCount &&
(rhs.path.MaxCountAfterAccess <= 0 || lhs.path.CountAfterAccess < rhs.path.MaxCountAfterAccess) {
return 1, lhsPseudo // left wins and has statistics (lhsPseudo==false)
}
if !rhsPseudo && globalResult <= 0 && sum <= 0 &&
rhs.path.EqOrInCondCount > 0 && rhs.path.EqOrInCondCount >= lhs.path.EqOrInCondCount {
rhs.path.EqOrInCondCount > 0 && rhs.path.EqOrInCondCount >= lhs.path.EqOrInCondCount &&
(lhs.path.MaxCountAfterAccess <= 0 || rhs.path.CountAfterAccess < lhs.path.MaxCountAfterAccess) {
return -1, rhsPseudo // right wins and has statistics (rhsPseudo==false)
}
if preferRange {
@ -1191,10 +1191,10 @@ func compareCandidates(sctx base.PlanContext, statsTbl *statistics.Table, tableI
if threshold > 0 { // set it to 0 to disable this rule
// riskResult is included to ensure we don't give preference to a higher-risk plan, given that
// this rule does not check the other criteria included below.
if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold && corrResult <= 0 {
if lhs.path.CountAfterAccess/rhs.path.CountAfterAccess > threshold && riskResult <= 0 {
return -1, rhsPseudo // right wins - also return whether it has statistics (pseudo) or not
}
if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold && corrResult >= 0 {
if rhs.path.CountAfterAccess/lhs.path.CountAfterAccess > threshold && riskResult >= 0 {
return 1, lhsPseudo // left wins - also return whether it has statistics (pseudo) or not
}
}

View File

@ -177,7 +177,8 @@ func fillIndexPath(ds *logicalop.DataSource, path *util.AccessPath, conds []expr
}
path.Ranges = ranger.FullRange()
path.CountAfterAccess = float64(ds.StatisticTable.RealtimeCount)
path.CorrCountAfterAccess = 0
path.MinCountAfterAccess = 0
path.MaxCountAfterAccess = 0
path.IdxCols, path.IdxColLens = expression.IndexInfo2PrefixCols(ds.Columns, ds.Schema().Columns, path.Index)
path.FullIdxCols, path.FullIdxColLens = expression.IndexInfo2Cols(ds.Columns, ds.Schema().Columns, path.Index)
if !path.Index.Unique && !path.Index.Primary && len(path.Index.Columns) == len(path.IdxCols) {
@ -415,10 +416,7 @@ func detachCondAndBuildRangeForPath(
if len(indexCols) > len(path.Index.Columns) { // remove clustered primary key if it has been added to path.IdxCols
indexCols = indexCols[0:len(path.Index.Columns)]
}
path.CountAfterAccess, path.CorrCountAfterAccess, err = cardinality.GetRowCountByIndexRanges(sctx, histColl, path.Index.ID, path.Ranges, indexCols)
if path.CorrCountAfterAccess == 0 {
path.CorrCountAfterAccess = path.CountAfterAccess
}
path.CountAfterAccess, path.MinCountAfterAccess, path.MaxCountAfterAccess, err = cardinality.GetRowCountByIndexRanges(sctx, histColl, path.Index.ID, path.Ranges, indexCols)
return err
}

View File

@ -42,12 +42,16 @@ type AccessPath struct {
// CountAfterAccess is the row count after we apply range seek and before we use other filter to filter data.
// For index merge path, CountAfterAccess is the row count after partial paths and before we apply table filters.
CountAfterAccess float64
// CorrCountAfterAccess is the row count after only applying the most filtering index columns.
// against the index. This is used when we don't have a full index statistics
// and we need to use the exponential backoff to estimate the row count.
// Case CorrCountAfterAccess > 0 : we use the exponential backoff to estimate the row count (such as we don't have a full index statistics)
// Default CorrCountAfterAccess = 0 : we use the index or table estimate of the row count directly (such as table full scan, point get etc)
CorrCountAfterAccess float64
// MinCountAfterAccess is a lower bound on CountAfterAccess, accounting for risks that could
// lead to overestimation, such as assuming correlation with exponential backoff when columns are actually independent.
// Case MinCountAfterAccess > 0 : we've encountered risky scenarios and have a potential lower row count estimation
// Default MinCountAfterAccess = 0 : we have not identified risks that could lead to lower row count
MinCountAfterAccess float64
// MaxCountAfterAccess is an upper bound on the CountAfterAccess, accounting for risks that could
// lead to underestimation, such as assuming independence between non-index columns.
// Case MaxCountAfterAccess > 0 : we've encountered risky scenarios and have a potential greater row count estimation
// Default MaxCountAfterAccess = 0 : we have not identified risks that could lead to greater row count
MaxCountAfterAccess float64
// CountAfterIndex is the row count after we apply filters on index and before we apply the table filters.
CountAfterIndex float64
AccessConds []expression.Expression
@ -139,7 +143,8 @@ func (path *AccessPath) Clone() *AccessPath {
ConstCols: slices.Clone(path.ConstCols),
Ranges: CloneRanges(path.Ranges),
CountAfterAccess: path.CountAfterAccess,
CorrCountAfterAccess: path.CorrCountAfterAccess,
MinCountAfterAccess: path.MinCountAfterAccess,
MaxCountAfterAccess: path.MaxCountAfterAccess,
CountAfterIndex: path.CountAfterIndex,
AccessConds: CloneExprs(path.AccessConds),
EqCondCount: path.EqCondCount,

View File

@ -393,51 +393,51 @@ func SubTestIndexRanges() func(*testing.T) {
HighVal: []types.Datum{types.MaxValueDatum()},
Collators: collate.GetBinaryCollatorSlice(1),
}}
count, _, err := GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err := GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 99900, int(count))
ran[0].LowVal[0] = types.NewIntDatum(1000)
ran[0].HighVal[0] = types.NewIntDatum(2000)
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 2500, int(count))
ran[0].LowVal[0] = types.NewIntDatum(1001)
ran[0].HighVal[0] = types.NewIntDatum(1999)
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 2500, int(count))
ran[0].LowVal[0] = types.NewIntDatum(1000)
ran[0].HighVal[0] = types.NewIntDatum(1000)
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 100, int(count))
tbl.SetIdx(0, &Index{Info: &model.IndexInfo{Columns: []*model.IndexColumn{{Offset: 0}}, Unique: true}})
ran[0].LowVal[0] = types.NewIntDatum(1000)
ran[0].HighVal[0] = types.NewIntDatum(1000)
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 1, int(count))
tbl.SetIdx(0, idx)
ran[0].LowVal[0] = types.MinNotNullDatum()
ran[0].HighVal[0] = types.MaxValueDatum()
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 100000, int(count))
ran[0].LowVal[0] = types.NewIntDatum(1000)
ran[0].HighVal[0] = types.NewIntDatum(2000)
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 1000, int(count))
ran[0].LowVal[0] = types.NewIntDatum(1001)
ran[0].HighVal[0] = types.NewIntDatum(1990)
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 989, int(count))
ran[0].LowVal[0] = types.NewIntDatum(1000)
ran[0].HighVal[0] = types.NewIntDatum(1000)
count, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
count, _, _, err = GetRowCountByIndexRanges(ctx, &tbl.HistColl, 0, ran, nil)
require.NoError(t, err)
require.Equal(t, 1, int(count))
}

View File

@ -50,7 +50,7 @@ var (
// Note: all functions below will be removed after finishing moving all estimation functions into the cardinality package.
// GetRowCountByIndexRanges is a function type to get row count by index ranges.
GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range, idxCol []*expression.Column) (result float64, corrResult float64, err error)
GetRowCountByIndexRanges func(sctx planctx.PlanContext, coll *HistColl, idxID int64, indexRanges []*ranger.Range, idxCol []*expression.Column) (result float64, minResult float64, maxResult float64, err error)
// GetRowCountByIntColumnRanges is a function type to get row count by int column ranges.
GetRowCountByIntColumnRanges func(sctx planctx.PlanContext, coll *HistColl, colID int64, intRanges []*ranger.Range) (result float64, err error)