子查询场景去除多余的distinct和group by子句

2024-02-22 20:48:11 +08:00
parent 9d4cf28d72
commit b9e819795f
5 changed files with 127 additions and 1 deletions
--- a/src/common/backend/utils/misc/guc/guc_sql.cpp
+++ b/src/common/backend/utils/misc/guc/guc_sql.cpp
@ -248,6 +248,7 @@ static const struct config_enum_entry rewrite_options[] = {
    {"predpushforce", PRED_PUSH_FORCE, false},
    {"disable_pullup_expr_sublink", SUBLINK_PULLUP_DISABLE_EXPR, false},
    {"enable_sublink_pullup_enhanced", SUBLINK_PULLUP_ENHANCED, false},
+    {"remove_redundant_distinct_group_by", REMOVE_REDUNDANT_DISTINCT_GROUP_BY, false},
    {NULL, 0, false}
 };

--- a/src/gausskernel/optimizer/plan/subselect.cpp
+++ b/src/gausskernel/optimizer/plan/subselect.cpp
@ -2032,6 +2032,34 @@ void SS_process_ctes(PlannerInfo* root)
    }
 }

+static void remove_redundant_distinct_group_by(Query* subselect)
+{
+    if (!ENABLE_REMOVE_REDUNDANT_DISTINCT_GROUP_BY()) {
+        return;
+    }
+
+    /*
+     * Remove distinct if distinct-on is not used and there is no limitClause.
+     */
+    if (subselect->distinctClause && !subselect->hasDistinctOn &&
+        subselect->limitCount == NULL && subselect->limitOffset == NULL) {
+        list_free_deep(subselect->distinctClause);
+        subselect->distinctClause = NULL;
+    }
+
+    /*
+     * Remove GROUP BY if there are no aggregate functions, HAVING clause, grouping
+     * sets and limitClause.
+     */
+
+    if (subselect->groupClause && !subselect->hasAggs && !subselect->hasWindowFuncs &&
+        subselect->havingQual == NULL && subselect->groupingSets == NULL &&
+        subselect->limitCount == NULL && subselect->limitOffset == NULL) {
+        list_free_deep(subselect->groupClause);
+        subselect->groupClause = NULL;
+    }
+}
+
 /*
 * convert_ANY_sublink_to_join: try to convert an ANY SubLink to a join
 *
@ -2111,6 +2139,8 @@ JoinExpr* convert_ANY_sublink_to_join(PlannerInfo* root, SubLink* sublink, bool
    if (contain_volatile_functions(sublink->testexpr))
        return NULL;

+    remove_redundant_distinct_group_by(subselect);
+
    /*
     * Okay, pull up the sub-select into upper range table.
     *
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@ -402,7 +402,8 @@ typedef enum {
    PRED_PUSH_NORMAL = (1 << 7),
    PRED_PUSH_FORCE = (1 << 8),
    SUBLINK_PULLUP_DISABLE_EXPR = (1 << 9), /* disable pull sublink in expr clause */
-    SUBLINK_PULLUP_ENHANCED = (1 << 10)
+    SUBLINK_PULLUP_ENHANCED = (1 << 10),
+    REMOVE_REDUNDANT_DISTINCT_GROUP_BY = (1 << 11)
 } rewrite_param;

 typedef enum {
@ -461,6 +462,9 @@ typedef enum {
 #define ENABLE_PRED_PUSH_ALL(root) \
    ((ENABLE_PRED_PUSH(root) || ENABLE_PRED_PUSH_NORMAL(root) || ENABLE_PRED_PUSH_FORCE(root)) && permit_predpush(root))

+#define ENABLE_REMOVE_REDUNDANT_DISTINCT_GROUP_BY() \
+    ((REMOVE_REDUNDANT_DISTINCT_GROUP_BY) & (uint)u_sess->attr.attr_sql.rewrite_rule)
+
 #define ENABLE_SQL_BETA_FEATURE(feature) \
    ((bool)((uint)u_sess->attr.attr_sql.sql_beta_feature & feature))

--- a/src/test/regress/expected/subselect_part2.out
+++ b/src/test/regress/expected/subselect_part2.out
@ -544,3 +544,74 @@ drop table tab_sa2;
 drop table tab_sa3;
 drop table tab_sa4;
 drop table tab_sa5;
+set rewrite_rule = 'remove_redundant_distinct_group_by';
+create table subselect_t1 (a int);
+create table subselect_t2 (a int, b int);
+insert into subselect_t1 values (1);
+insert into subselect_t1 values (2);
+insert into subselect_t1 values (-1);
+insert into subselect_t2 values (1, 1);
+insert into subselect_t2 values (2, 2);
+insert into subselect_t2 values (-1, -1);
+explain (costs off) select * from subselect_t1 where a in (select distinct a from subselect_t2);
+                   QUERY PLAN                   
+------------------------------------------------
+ Hash Join
+   Hash Cond: (subselect_t1.a = subselect_t2.a)
+   ->  Seq Scan on subselect_t1
+   ->  Hash
+         ->  HashAggregate
+               Group By Key: subselect_t2.a
+               ->  Seq Scan on subselect_t2
+(7 rows)
+
+select * from subselect_t1 where a in (select distinct a from subselect_t2);
+ a  
+----
+  1
+  2
+ -1
+(3 rows)
+
+explain (costs off) select * from subselect_t1 where a in (select a from subselect_t2 group by a);
+                   QUERY PLAN                   
+------------------------------------------------
+ Hash Join
+   Hash Cond: (subselect_t1.a = subselect_t2.a)
+   ->  Seq Scan on subselect_t1
+   ->  Hash
+         ->  HashAggregate
+               Group By Key: subselect_t2.a
+               ->  Seq Scan on subselect_t2
+(7 rows)
+
+select * from subselect_t1 where a in (select a from subselect_t2 group by a);
+ a  
+----
+  1
+  2
+ -1
+(3 rows)
+
+explain (costs off) select * from subselect_t1 where a in (select distinct on (abs(a)) a from subselect_t2);
+                        QUERY PLAN                         
+-----------------------------------------------------------
+ Hash Semi Join
+   Hash Cond: (subselect_t1.a = "ANY_subquery".a)
+   ->  Seq Scan on subselect_t1
+   ->  Hash
+         ->  Subquery Scan on "ANY_subquery"
+               ->  Unique
+                     ->  Sort
+                           Sort Key: (abs(subselect_t2.a))
+                           ->  Seq Scan on subselect_t2
+(9 rows)
+
+select * from subselect_t1 where a in (select distinct on (abs(a)) a from subselect_t2);
+ a 
+---
+ 1
+ 2
+(2 rows)
+
+reset rewrite_rule;
--- a/src/test/regress/sql/subselect_part2.sql
+++ b/src/test/regress/sql/subselect_part2.sql
@ -387,3 +387,23 @@ drop table tab_sa2;
 drop table tab_sa3;
 drop table tab_sa4;
 drop table tab_sa5;
+
+set rewrite_rule = 'remove_redundant_distinct_group_by';
+create table subselect_t1 (a int);
+create table subselect_t2 (a int, b int);
+insert into subselect_t1 values (1);
+insert into subselect_t1 values (2);
+insert into subselect_t1 values (-1);
+insert into subselect_t2 values (1, 1);
+insert into subselect_t2 values (2, 2);
+insert into subselect_t2 values (-1, -1);
+
+explain (costs off) select * from subselect_t1 where a in (select distinct a from subselect_t2);
+select * from subselect_t1 where a in (select distinct a from subselect_t2);
+
+explain (costs off) select * from subselect_t1 where a in (select a from subselect_t2 group by a);
+select * from subselect_t1 where a in (select a from subselect_t2 group by a);
+
+explain (costs off) select * from subselect_t1 where a in (select distinct on (abs(a)) a from subselect_t2);
+select * from subselect_t1 where a in (select distinct on (abs(a)) a from subselect_t2);
+reset rewrite_rule;