!6047 解决samplescan行数估算差距大，执行计划选择错误导致的性能问题

Merge pull request !6047 from laishenghao/samplescan
2024-08-15 09:27:18 +00:00
parent f4232f8055 84f04a56d1
commit bae9b63a3b
3 changed files with 107 additions and 29 deletions
--- a/src/gausskernel/optimizer/path/costsize.cpp
+++ b/src/gausskernel/optimizer/path/costsize.cpp
@ -6230,14 +6230,14 @@ void finalize_dml_cost(ModifyTable* plan)
 * Parameters:
 *	@in pctnode: node of percent args.
 *
- * Return: float4
+ * Return: float8
 */
-static float4 get_samplefract(Node* pctnode)
+static float8 get_samplefract(Node* pctnode)
 {
-    float4 samplefract;
+    float8 samplefract;

    if (IsA(pctnode, Const) && !((Const*)pctnode)->constisnull) {
-        samplefract = DatumGetFloat4(((Const*)pctnode)->constvalue);
+        samplefract = DatumGetFloat8(((Const*)pctnode)->constvalue);
        if (samplefract >= 0.0 && samplefract <= 100.0 && !isnan(samplefract)) {
            samplefract /= 100.0f;
        } else {
@ -6265,7 +6265,7 @@ static float4 get_samplefract(Node* pctnode)
 void system_samplescangetsamplesize(PlannerInfo* root, RelOptInfo* baserel, List* paramexprs)
 {
    Node* pctnode = NULL;
-    float4 samplefract;
+    float8 samplefract;

    /* Try to extract an estimate for the sample percentage */
    pctnode = (Node*)linitial(paramexprs);
@ -6292,7 +6292,7 @@ void system_samplescangetsamplesize(PlannerInfo* root, RelOptInfo* baserel, List
 void bernoulli_samplescangetsamplesize(PlannerInfo* root, RelOptInfo* baserel, List* paramexprs)
 {
    Node* pctnode = NULL;
-    float4 samplefract;
+    float8 samplefract;

    /* Try to extract an estimate for the sample percentage */
    pctnode = (Node*)linitial(paramexprs);
@ -6325,7 +6325,7 @@ void hybrid_samplescangetsamplesize(PlannerInfo* root, RelOptInfo* baserel, List
    foreach (lc, paramexprs) {
        Node* paramnode = (Node*)lfirst(lc);
        Node* pctnode = estimate_expression_value(root, paramnode);
-        float4 samplefract = 0.0;
+        float8 samplefract = 0.0;
        if (likely(pctnode)) {
            samplefract = get_samplefract(pctnode);
        } else {
--- a/src/test/regress/expected/tablesample_3.out
+++ b/src/test/regress/expected/tablesample_3.out
@ -20,15 +20,16 @@ select count(*) from ((select * from test_tablesample tablesample SYSTEM(20) REP
 (1 row)

 explain (costs off) select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left join test_tablesample2 tablesample BERNOULLI(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;
-                                      QUERY PLAN                                       
---------------------------------------------------------------------------------------
- Nested Loop Anti Join
-   Join Filter: (test_tablesample.id = test_tablesample2.id)
+                                         QUERY PLAN                                          
+---------------------------------------------------------------------------------------------
+ Hash Anti Join
+   Hash Cond: (test_tablesample.id = test_tablesample2.id)
   ->  Sample Scan on test_tablesample
         Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
-   ->  Sample Scan on test_tablesample2
-         Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
-(6 rows)
+   ->  Hash
+         ->  Sample Scan on test_tablesample2
+               Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
+(7 rows)

 select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left join test_tablesample2 tablesample BERNOULLI(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;
 id | name | salary | id | name | salary 
@ -37,15 +38,19 @@ select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left j

 set enable_hashjoin to off;
 explain (costs off) select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left join test_tablesample2 tablesample BERNOULLI(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;
-                                      QUERY PLAN                                       
---------------------------------------------------------------------------------------
- Nested Loop Anti Join
-   Join Filter: (test_tablesample.id = test_tablesample2.id)
-   ->  Sample Scan on test_tablesample
-         Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
-   ->  Sample Scan on test_tablesample2
-         Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
-(6 rows)
+                                         QUERY PLAN                                          
+---------------------------------------------------------------------------------------------
+ Merge Anti Join
+   Merge Cond: (test_tablesample.id = test_tablesample2.id)
+   ->  Sort
+         Sort Key: test_tablesample.id
+         ->  Sample Scan on test_tablesample
+               Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
+   ->  Sort
+         Sort Key: test_tablesample2.id
+         ->  Sample Scan on test_tablesample2
+               Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
+(10 rows)

 select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left join test_tablesample2 tablesample BERNOULLI(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;
 id | name | salary | id | name | salary 
@ -54,15 +59,16 @@ select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left j

 set enable_mergejoin to off;
 explain (costs off) select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left join test_tablesample2 tablesample BERNOULLI(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;
-                                      QUERY PLAN                                       
---------------------------------------------------------------------------------------
+                                         QUERY PLAN                                          
+---------------------------------------------------------------------------------------------
 Nested Loop Anti Join
   Join Filter: (test_tablesample.id = test_tablesample2.id)
   ->  Sample Scan on test_tablesample
         Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
-   ->  Sample Scan on test_tablesample2
-         Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
-(6 rows)
+   ->  Materialize
+         ->  Sample Scan on test_tablesample2
+               Sampling: bernoulli (50::double precision) REPEATABLE (200::double precision)
+(7 rows)

 select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left join test_tablesample2 tablesample BERNOULLI(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;
 id | name | salary | id | name | salary 
@ -315,8 +321,61 @@ select * from test_tablesample tablesample SYSTEM(50) REPEATABLE (200) left join
 ----+------+--------+----+------+--------
 (0 rows)

+-- test rows estimation of samplescan
+set enable_hashjoin to on;
+create table ss_rows_t1 (a int);
+create table ss_rows_t2 (b int);
+create table ss_rows_t3 (c int);
+insert into ss_rows_t1 values (generate_series(1, 20000));
+insert into ss_rows_t2 values (generate_series(1, 20000));
+insert into ss_rows_t3 values (generate_series(1, 20000));
+explain select a from  ss_rows_t1 tablesample system (99.999999);
+--?.*
+--?.*
+--? Sample Scan on ss_rows_t1  (cost=.* rows=\d{5} width=4)
+   Sampling: system (99.999999::double precision)
+(2 rows)
+
+explain select b from  ss_rows_t2 tablesample bernoulli (99.999999);
+--?.*
+--?.*
+--? Sample Scan on ss_rows_t2  (cost=.* rows=\d{5} width=4)
+   Sampling: bernoulli (99.999999::double precision)
+(2 rows)
+
+explain select c from  ss_rows_t3 tablesample hybrid (99.999999,99.999999);
+--?.*
+--?.*
+--? Sample Scan on ss_rows_t3  (cost=.* rows=\d{5} width=4)
+   Sampling: hybrid (99.999999::double precision, 99.999999::double precision)
+(2 rows)
+
+explain select a, b, c from
+    ss_rows_t1 tablesample system (99.999999) repeatable (325),
+    ss_rows_t2 tablesample bernoulli (99.999999) repeatable (0),
+    ss_rows_t3 tablesample hybrid (99.999999,99.999999) repeatable (510)
+    where a = b and b = c;
+--?.*
+--?.*
+--? Hash Join  (.* width=12)
+   Hash Cond: (ss_rows_t1.a = ss_rows_t3.c)
+--?   ->  Hash Join  (.* width=8)
+         Hash Cond: (ss_rows_t1.a = ss_rows_t2.b)
+--?         ->  Sample Scan on ss_rows_t1  (cost=.* rows=\d{5} width=4)
+               Sampling: system (99.999999::double precision) REPEATABLE (325::double precision)
+--?         ->  Hash  (cost=.* rows=\d{5} width=4)
+--?               ->  Sample Scan on ss_rows_t2  (cost=.* rows=\d{5} width=4)
+                     Sampling: bernoulli (99.999999::double precision) REPEATABLE (0::double precision)
+--?   ->  Hash  (cost=.* rows=\d{5} width=4)
+--?         ->  Sample Scan on ss_rows_t3  (cost=.* rows=\d{5} width=4)
+               Sampling: hybrid (99.999999::double precision, 99.999999::double precision) REPEATABLE (510::double precision)
+(12 rows)
+
 reset search_path;
 drop schema  tablesample_schema4 cascade;
-NOTICE:  drop cascades to 2 other objects
+NOTICE:  drop cascades to 5 other objects
 DETAIL:  drop cascades to table tablesample_schema4.test_tablesample
 drop cascades to table tablesample_schema4.test_tablesample2
+drop cascades to table tablesample_schema4.ss_rows_t1
+drop cascades to table tablesample_schema4.ss_rows_t2
+drop cascades to table tablesample_schema4.ss_rows_t3
--- a/src/test/regress/sql/tablesample_3.sql
+++ b/src/test/regress/sql/tablesample_3.sql
@ -95,5 +95,24 @@ select * from test_tablesample tablesample BERNOULLI(50) REPEATABLE (200) left j
 explain (costs off) select * from test_tablesample tablesample SYSTEM(50) REPEATABLE (200) left join test_tablesample2 tablesample SYSTEM(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;
 select * from test_tablesample tablesample SYSTEM(50) REPEATABLE (200) left join test_tablesample2 tablesample SYSTEM(50) REPEATABLE (200) on test_tablesample.id=test_tablesample2.id where test_tablesample2.id is NULL;

+-- test rows estimation of samplescan
+set enable_hashjoin to on;
+create table ss_rows_t1 (a int);
+create table ss_rows_t2 (b int);
+create table ss_rows_t3 (c int);
+insert into ss_rows_t1 values (generate_series(1, 20000));
+insert into ss_rows_t2 values (generate_series(1, 20000));
+insert into ss_rows_t3 values (generate_series(1, 20000));
+
+explain select a from  ss_rows_t1 tablesample system (99.999999);
+explain select b from  ss_rows_t2 tablesample bernoulli (99.999999);
+explain select c from  ss_rows_t3 tablesample hybrid (99.999999,99.999999);
+
+explain select a, b, c from
+    ss_rows_t1 tablesample system (99.999999) repeatable (325),
+    ss_rows_t2 tablesample bernoulli (99.999999) repeatable (0),
+    ss_rows_t3 tablesample hybrid (99.999999,99.999999) repeatable (510)
+    where a = b and b = c;
+
 reset search_path;
 drop schema  tablesample_schema4 cascade;