From a2d42842d2ebbcf00f56abe5ee776acdbd810ac5 Mon Sep 17 00:00:00 2001 From: Yuanjia Zhang Date: Fri, 31 Oct 2025 18:52:35 +0800 Subject: [PATCH] planner: set an lower-bound for NDV used in out-of-range estimation for EQ conditions when the Histogram is empty (#64139) close pingcap/tidb#64137 --- pkg/planner/cardinality/BUILD.bazel | 2 +- pkg/planner/cardinality/row_count_index.go | 13 ++++--- pkg/planner/cardinality/selectivity.go | 10 +++++- pkg/planner/cardinality/selectivity_test.go | 35 +++++++++++++++++-- .../cbotest/testdata/analyze_suite_out.json | 18 +++++----- .../cbotest/testdata/analyze_suite_xut.json | 18 +++++----- 6 files changed, 69 insertions(+), 27 deletions(-) diff --git a/pkg/planner/cardinality/BUILD.bazel b/pkg/planner/cardinality/BUILD.bazel index d5b300b7e9..f6aa0d1878 100644 --- a/pkg/planner/cardinality/BUILD.bazel +++ b/pkg/planner/cardinality/BUILD.bazel @@ -64,7 +64,7 @@ go_test( data = glob(["testdata/**"]), embed = [":cardinality"], flaky = True, - shard_count = 44, + shard_count = 45, deps = [ "//pkg/config", "//pkg/config/kerneltype", diff --git a/pkg/planner/cardinality/row_count_index.go b/pkg/planner/cardinality/row_count_index.go index b0ba8ef2c9..4d5ab875d8 100644 --- a/pkg/planner/cardinality/row_count_index.go +++ b/pkg/planner/cardinality/row_count_index.go @@ -429,8 +429,8 @@ func estimateRowCountWithUniformDistribution( increaseFactor := stats.GetIncreaseFactor(realtimeRowCount) notNullCount := histogram.NotNullCount() - // Branch 1: all NDV's are in TopN, and no histograms. - if histNDV <= 0 || notNullCount == 0 { + var avgRowEstimate float64 + if histNDV <= 0 || notNullCount == 0 { // Branch 1: all NDV's are in TopN, and no histograms. // We have no histograms, but c.Histogram.NDV > c.TopN.Num(). // This can happen when sampling collects fewer than all NDV. 
if histNDV > 0 && modifyCount == 0 { @@ -441,12 +441,11 @@ func estimateRowCountWithUniformDistribution( if notNullCount <= 0 { notNullCount = totalRowCount - float64(histogram.NullCount) } - outOfRangeCnt := outOfRangeFullNDV(float64(histogram.NDV), totalRowCount, notNullCount, float64(realtimeRowCount), increaseFactor, modifyCount) - return statistics.DefaultRowEst(outOfRangeCnt) + avgRowEstimate = outOfRangeFullNDV(float64(histogram.NDV), totalRowCount, notNullCount, float64(realtimeRowCount), increaseFactor, modifyCount) + } else { // Branch 2: some NDV's are in histograms + // Calculate the average histogram rows (which excludes topN) and NDV that excluded topN + avgRowEstimate = notNullCount / histNDV } - // branch 2: some NDV's are in histograms - // Calculate the average histogram rows (which excludes topN) and NDV that excluded topN - avgRowEstimate := notNullCount / histNDV // skewRatio determines how much of the potential skew should be considered skewRatio := sctx.GetSessionVars().RiskEqSkewRatio diff --git a/pkg/planner/cardinality/selectivity.go b/pkg/planner/cardinality/selectivity.go index fb140595eb..d0e2345162 100644 --- a/pkg/planner/cardinality/selectivity.go +++ b/pkg/planner/cardinality/selectivity.go @@ -1173,6 +1173,7 @@ func outOfRangeEQSelectivity(sctx planctx.PlanContext, ndv, realtimeRowCount, co // outOfRangeFullNDV estimates the number of qualified rows when the topN represents all NDV values // and the searched value does not appear in the topN func outOfRangeFullNDV(ndv, origRowCount, notNullCount, realtimeRowCount, increaseFactor float64, modifyCount int64) (result float64) { + // TODO: align or merge this out-of-range estimation method with `Histogram.OutOfRangeRowCount`. // If the table hasn't been modified, it's safe to return 0. 
if modifyCount == 0 { return 0 @@ -1189,7 +1190,7 @@ func outOfRangeFullNDV(ndv, origRowCount, notNullCount, realtimeRowCount, increa if newRows < 0 { newRows = min(notNullCount, realtimeRowCount) } - // if no NDV - derive an NDV using sqrt + // if no NDV - derive an NDV using sqrt; this could happen for unanalyzed tables if ndv <= 0 { ndv = math.Sqrt(max(notNullCount, realtimeRowCount)) } else { @@ -1197,6 +1198,13 @@ func outOfRangeFullNDV(ndv, origRowCount, notNullCount, realtimeRowCount, increa // the caller of the function ndv *= increaseFactor } + // If topN represents all NDV values, the NDV should be relatively small. + // A small NDV could cause extremely inaccurate results, so use `outOfRangeBetweenRate` as a lower bound to smooth the result. + // For example, TopN = {(value:1, rows: 10000), (2, 10000), (3, 10000)} and newRows = 15000, we should assume most + // newly added rows are 1, 2 or 3. Then for an out-of-range estimation like `where col=9999`, the result should be + // close to 0, but if we still use the original NDV, the result could be extremely large: 15000/3 = 5000. + // See #64137 for a concrete example. 
+ ndv = max(ndv, float64(outOfRangeBetweenRate)) // avoid inaccurate estimate caused by small NDV return max(1, newRows/ndv) } diff --git a/pkg/planner/cardinality/selectivity_test.go b/pkg/planner/cardinality/selectivity_test.go index 08381f43cc..ddea56708b 100644 --- a/pkg/planner/cardinality/selectivity_test.go +++ b/pkg/planner/cardinality/selectivity_test.go @@ -574,8 +574,8 @@ func TestEstimationForUnknownValuesAfterModify(t *testing.T) { countEst, err = cardinality.GetColumnRowCount(sctx, col, getRange(15, 15), statsTblNew.RealtimeCount, statsTblNew.ModifyCount, false) count = countEst.Est require.NoError(t, err) - require.Truef(t, count < 40, "expected: between 20 to 40, got: %v", count) - require.Truef(t, count > 20, "expected: between 20 to 40, got: %v", count) + require.Truef(t, count < 40, "expected: between 10 to 40, got: %v", count) + require.Truef(t, count > 10, "expected: between 10 to 40, got: %v", count) } func TestNewIndexWithoutStats(t *testing.T) { @@ -2094,6 +2094,37 @@ func TestLastBucketEndValueHeuristic(t *testing.T) { } } +func TestIssue64137(t *testing.T) { + store, dom := testkit.CreateMockStoreAndDomain(t) + h := dom.StatsHandle() + tk := testkit.NewTestKit(t, store) + tk.MustExec(`use test`) + tk.MustExec(`create table t (a int, key(a))`) + tk.MustExec(`set @@cte_max_recursion_depth=10000`) + tk.MustExec(`insert into t select * from (with recursive cte as ( + select 1 as a, 1 as num union all + select 1 as a, num+1 as num from cte where num < 10000 + ) select a from cte) tt;`) // insert 10000 rows with a=1 + require.NoError(t, h.DumpStatsDeltaToKV(true)) + tk.MustQuery(`select count(1) from t`).Check(testkit.Rows("10000")) + tk.MustExec(`analyze table t`) + tk.MustQuery(`show stats_topn where is_index=1`).Check(testkit.Rows("test t a 1 1 10000")) // 1 topN value with count 10000 + + tk.MustExec(`insert into t select * from t limit 2000`) // insert 2000 rows with a=1 + require.NoError(t, h.DumpStatsDeltaToKV(true)) + 
h.Update(context.Background(), dom.InfoSchema()) + statsMeta := tk.MustQuery(`show stats_meta`).Rows()[0] + require.Equal(t, statsMeta[4], "2000") // modify_count = 2000 + require.Equal(t, statsMeta[5], "12000") // row_count = 10000+2000 + + tk.MustQuery(`explain select * from t where a=99999999`).Check(testkit.Rows( + `IndexReader_7 24.00 root index:IndexRangeScan_6`, // out-of-range est for small NDV, result should be close to zero + `└─IndexRangeScan_6 24.00 cop[tikv] table:t, index:a(a) range:[99999999,99999999], keep order:false`)) + tk.MustQuery(`explain select * from t where a=1`).Check(testkit.Rows( + `IndexReader_7 12000.00 root index:IndexRangeScan_6`, // in-range est for small NDV + `└─IndexRangeScan_6 12000.00 cop[tikv] table:t, index:a(a) range:[1,1], keep order:false`)) +} + func TestUninitializedStats(t *testing.T) { store, _ := testkit.CreateMockStoreAndDomain(t) tk := testkit.NewTestKit(t, store) diff --git a/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_out.json b/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_out.json index 145692a7f1..5bca91072a 100644 --- a/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_out.json +++ b/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_out.json @@ -526,12 +526,14 @@ "SQL": "EXPLAIN format = 'verbose' SELECT * FROM `tbl_cardcore_transaction` `transactio0_` WHERE `transactio0_`.`period` = '202502' AND `transactio0_`.`account_number` = '1901040107462200' ORDER BY `transactio0_`.`transaction_status`, `transactio0_`.`account_number`, `transactio0_`.`entry_date` ASC, `transactio0_`.`id` ASC;", "Plan": [ "Sort_5 1.00 39640.65 root cardcore_issuing.tbl_cardcore_transaction.transaction_status, cardcore_issuing.tbl_cardcore_transaction.account_number, cardcore_issuing.tbl_cardcore_transaction.entry_date, cardcore_issuing.tbl_cardcore_transaction.id", - "└─IndexLookUp_15 1.00 39619.45 root ", - " ├─IndexRangeScan_12(Build) 16.16 4094.52 cop[tikv] table:transactio0_, 
index:tbl_cardcore_transaction_ix10(account_number, entry_date, value_date) range:[\"1901040107462200\",\"1901040107462200\"], keep order:false", - " └─Selection_14(Probe) 1.00 5431.27 cop[tikv] eq(cardcore_issuing.tbl_cardcore_transaction.period, \"202502\")", - " └─TableRowIDScan_13 16.16 4624.68 cop[tikv] table:transactio0_ keep order:false" + "└─IndexLookUp_12 1.00 39619.45 root ", + " ├─IndexRangeScan_9(Build) 16.16 4094.52 cop[tikv] table:transactio0_, index:tbl_cardcore_transaction_ix10(account_number, entry_date, value_date) range:[\"1901040107462200\",\"1901040107462200\"], keep order:false", + " └─Selection_11(Probe) 1.00 5431.27 cop[tikv] eq(cardcore_issuing.tbl_cardcore_transaction.period, \"202502\")", + " └─TableRowIDScan_10 16.16 4624.68 cop[tikv] table:transactio0_ keep order:false" ], - "Warn": null + "Warn": [ + "Note 1105 [tbl_cardcore_transaction_ix10,tbl_cardcore_transaction_ix17] remain after pruning paths for transactio0_ given Prop{SortItems: [], TaskTp: rootTask}" + ] } ] }, @@ -576,9 +578,9 @@ { "SQL": "explain select * from tbl_cardcore_statement s where s.latest_stmt_print_date = '2024-10-16';", "Plan": [ - "IndexLookUp_11 169960.89 root ", - "├─IndexRangeScan_9(Build) 169960.89 cop[tikv] table:s, index:tbl_cardcore_statement_ix7(latest_stmt_print_date) range:[2024-10-16,2024-10-16], keep order:false", - "└─TableRowIDScan_10(Probe) 169960.89 cop[tikv] table:s keep order:false" + "IndexLookUp_11 53778.89 root ", + "├─IndexRangeScan_9(Build) 53778.89 cop[tikv] table:s, index:tbl_cardcore_statement_ix7(latest_stmt_print_date) range:[2024-10-16,2024-10-16], keep order:false", + "└─TableRowIDScan_10(Probe) 53778.89 cop[tikv] table:s keep order:false" ], "Warn": null } diff --git a/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_xut.json b/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_xut.json index 145692a7f1..5bca91072a 100644 --- a/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_xut.json +++ 
b/pkg/planner/core/casetest/cbotest/testdata/analyze_suite_xut.json @@ -526,12 +526,14 @@ "SQL": "EXPLAIN format = 'verbose' SELECT * FROM `tbl_cardcore_transaction` `transactio0_` WHERE `transactio0_`.`period` = '202502' AND `transactio0_`.`account_number` = '1901040107462200' ORDER BY `transactio0_`.`transaction_status`, `transactio0_`.`account_number`, `transactio0_`.`entry_date` ASC, `transactio0_`.`id` ASC;", "Plan": [ "Sort_5 1.00 39640.65 root cardcore_issuing.tbl_cardcore_transaction.transaction_status, cardcore_issuing.tbl_cardcore_transaction.account_number, cardcore_issuing.tbl_cardcore_transaction.entry_date, cardcore_issuing.tbl_cardcore_transaction.id", - "└─IndexLookUp_15 1.00 39619.45 root ", - " ├─IndexRangeScan_12(Build) 16.16 4094.52 cop[tikv] table:transactio0_, index:tbl_cardcore_transaction_ix10(account_number, entry_date, value_date) range:[\"1901040107462200\",\"1901040107462200\"], keep order:false", - " └─Selection_14(Probe) 1.00 5431.27 cop[tikv] eq(cardcore_issuing.tbl_cardcore_transaction.period, \"202502\")", - " └─TableRowIDScan_13 16.16 4624.68 cop[tikv] table:transactio0_ keep order:false" + "└─IndexLookUp_12 1.00 39619.45 root ", + " ├─IndexRangeScan_9(Build) 16.16 4094.52 cop[tikv] table:transactio0_, index:tbl_cardcore_transaction_ix10(account_number, entry_date, value_date) range:[\"1901040107462200\",\"1901040107462200\"], keep order:false", + " └─Selection_11(Probe) 1.00 5431.27 cop[tikv] eq(cardcore_issuing.tbl_cardcore_transaction.period, \"202502\")", + " └─TableRowIDScan_10 16.16 4624.68 cop[tikv] table:transactio0_ keep order:false" ], - "Warn": null + "Warn": [ + "Note 1105 [tbl_cardcore_transaction_ix10,tbl_cardcore_transaction_ix17] remain after pruning paths for transactio0_ given Prop{SortItems: [], TaskTp: rootTask}" + ] } ] }, @@ -576,9 +578,9 @@ { "SQL": "explain select * from tbl_cardcore_statement s where s.latest_stmt_print_date = '2024-10-16';", "Plan": [ - "IndexLookUp_11 169960.89 root ", - 
"├─IndexRangeScan_9(Build) 169960.89 cop[tikv] table:s, index:tbl_cardcore_statement_ix7(latest_stmt_print_date) range:[2024-10-16,2024-10-16], keep order:false", - "└─TableRowIDScan_10(Probe) 169960.89 cop[tikv] table:s keep order:false" + "IndexLookUp_11 53778.89 root ", + "├─IndexRangeScan_9(Build) 53778.89 cop[tikv] table:s, index:tbl_cardcore_statement_ix7(latest_stmt_print_date) range:[2024-10-16,2024-10-16], keep order:false", + "└─TableRowIDScan_10(Probe) 53778.89 cop[tikv] table:s keep order:false" ], "Warn": null }