From be94cc71b5709bd46912698c8c208bea0d93aef5 Mon Sep 17 00:00:00 2001
From: luozihao <1165977584@qq.com>
Date: Tue, 30 Apr 2024 11:53:18 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=9C=A8=E5=B1=82=E6=AC=A1?=
 =?UTF-8?q?=E6=9F=A5=E8=AF=A2=E4=B8=AD=E5=B0=86where=E6=9D=A1=E4=BB=B6?=
 =?UTF-8?q?=E5=88=86=E5=89=B2=E4=B8=BAjoin=E5=92=8C=E9=9D=9Ejoin=EF=BC=8C?=
 =?UTF-8?q?=E5=B9=B6=E5=B0=86join=E6=9D=A1=E4=BB=B6=E4=B8=8B=E6=8E=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/common/backend/parser/parse_startwith.cpp | 106 +++++++++++++++---
 src/test/regress/expected/sw_bugfix-1.out     |  29 ++++-
 src/test/regress/expected/sw_bugfix-2.out     |  67 +++++++++++
 src/test/regress/expected/sw_icbc.out         |   6 +-
 src/test/regress/sql/sw_bugfix-1.sql          |  21 ++++
 src/test/regress/sql/sw_bugfix-2.sql          |  24 +++-
 6 files changed, 228 insertions(+), 25 deletions(-)

diff --git a/src/common/backend/parser/parse_startwith.cpp b/src/common/backend/parser/parse_startwith.cpp
index 865761b03..e50e520ce 100644
--- a/src/common/backend/parser/parse_startwith.cpp
+++ b/src/common/backend/parser/parse_startwith.cpp
@@ -273,6 +273,7 @@ void transformStartWith(ParseState *pstate, SelectStmt *stmt, Query *qry)
         transformSingleRTE(pstate, qry, &context, (Node *)stmt->startWithClause);
     } else {
         transformFromList(pstate, qry, &context, (Node *)stmt->startWithClause);
+        stmt->whereClause = context.whereClause;
     }
 
     return;
@@ -1524,6 +1525,82 @@ static bool count_columnref_walker(Node *node, int *columnref_count)
     return false;
 }
 
+/*
+ * split_where_expr_by_join
+ *
+ * According to the logic of Hierarchical Queries processing, start with operation will be as follows
+ * 1. Evaluating JOIN expr in where clause if there any(should only be muti-table join).
+ * 2. Evaluating START WITH/CONNECT BY condition.
+ * 3. Evaluating remaining where clause a.k.a NON-JOIN quals if there any.
+ *
+ * Based on that, we have to push down the join-qual for start-with/connect-by quals, but non-join quals save for later
+ * filtering right after start with operation is done.
+ * Note that, for OR expr, if we have to replace the sub_expr, we have to set it as FALSE, other case will be TRUE.
+ *
+ * For expr = (A AND (B OR C)) OR (D AND E) AND (F OR G) OR (H OR I), and we assume C, E, F, I is non_join qual,
+ *
+ * so the remaining JOIN expr will be
+ *             (A AND B) OR (D) AND (G) OR H
+ * NON-JOIN expr will be
+ *             (C) OR (E) AND (F) OR (I)
+ */
+static void split_where_expr_by_join(Node **p_expr_join, Node **p_expr_non_join)
+{
+    if (p_expr_join == NULL || (p_expr_join != NULL && *p_expr_join == NULL)) {
+        return;
+    }
+ 
+    Node *expr_join = *p_expr_join;
+    Node *expr_non_join = *p_expr_non_join;
+ 
+    if (IsA(expr_join, A_Expr)) {
+        A_Expr *a_expr_join = (A_Expr *)expr_join;
+        A_Expr *a_expr_non_join = (A_Expr *)expr_non_join;
+ 
+        if (a_expr_join->kind == AEXPR_AND || a_expr_join->kind == AEXPR_OR) {
+            if (a_expr_join->lexpr != NULL) {
+                split_where_expr_by_join(&(a_expr_join->lexpr), &(a_expr_non_join->lexpr));
+            }
+            if (a_expr_join->rexpr != NULL) {
+                split_where_expr_by_join(&(a_expr_join->rexpr), &(a_expr_non_join->rexpr));
+            }
+            if (a_expr_join->lexpr == NULL) {
+                *p_expr_join = a_expr_join->rexpr;
+            } else if (a_expr_join->rexpr == NULL) {
+                *p_expr_join = a_expr_join->lexpr;
+            }
+ 
+            if (a_expr_non_join->lexpr == NULL) {
+                *p_expr_non_join = a_expr_non_join->rexpr;
+            } else if (a_expr_non_join->rexpr == NULL) {
+                *p_expr_non_join = a_expr_non_join->lexpr;
+            }
+        } else if (a_expr_join->kind == AEXPR_OP) {
+            int l_cref_count = 0;
+            int r_cref_count = 0;
+            count_columnref_walker(a_expr_join->lexpr, &l_cref_count);
+            count_columnref_walker(a_expr_join->rexpr, &r_cref_count);
+ 
+            /* if there're columnRefs at both sides meaning its a join-qual, otherwise, non-join-qual */
+            if (l_cref_count != 0 && r_cref_count != 0) {
+                pfree_ext(expr_non_join);
+                *p_expr_non_join = NULL;
+            } else {
+                pfree_ext(expr_join);
+                *p_expr_join = NULL;
+            }
+        } else {
+            /* must not be a join qual for non AEXPR_OP type */
+            pfree_ext(expr_join);
+            *p_expr_join = NULL;
+        }
+    } else {
+        /* must not be a join qual for non-a_expr type */
+        pfree_ext(expr_join);
+        *p_expr_join = NULL;
+    }
+}
+
 static bool walker_to_exclude_non_join_quals(Node *node, Node *context_node)
 {
     if (node == NULL) {
@@ -1616,17 +1693,12 @@ static SelectStmt *CreateStartWithCTEInnerBranch(ParseState* pstate,
 
         if (whereClause != NULL) {
             JoinExpr *final_join = (JoinExpr *)origin_table;
-            /* pushdown requires deep copying of the quals */
-            Node *whereCopy = (Node *)copyObject(whereClause);
-            /* only join quals can be pushed down */
-            raw_expression_tree_walker((Node *)whereCopy,
-                (bool (*)())walker_to_exclude_non_join_quals, (void*)NULL);
 
             if (final_join->quals == NULL) {
-                final_join->quals = whereCopy;
+                final_join->quals = whereClause;
             } else {
                 final_join->quals =
-                    (Node *)makeA_Expr(AEXPR_AND, NULL, whereCopy, final_join->quals, -1);
+                    (Node *)makeA_Expr(AEXPR_AND, NULL, whereClause, final_join->quals, -1);
             }
         }
     }
@@ -1700,17 +1772,11 @@ static SelectStmt *CreateStartWithCTEOuterBranch(ParseState *pstate,
 
     /* push whereClause down to init part, taking care to avoid NULL in expr. */
     quals = (Node *)startWithExpr;
-    Node* whereClauseCopy = (Node *)copyObject(whereClause);
-    if (whereClause != NULL) {
-        /* only join quals can be pushed down */
-        raw_expression_tree_walker((Node*)whereClauseCopy,
-            (bool (*)())walker_to_exclude_non_join_quals, (void*)NULL);
-    }
 
     if (quals == NULL) {
-        quals = whereClauseCopy;
+        quals = whereClause;
     } else if (whereClause != NULL) {
-        quals = (Node *)makeA_Expr(AEXPR_AND, NULL, whereClauseCopy,
+        quals = (Node *)makeA_Expr(AEXPR_AND, NULL, whereClause,
             (Node*)startWithExpr, -1);
     }
 
@@ -1878,15 +1944,19 @@ static void transformFromList(ParseState* pstate, Query* qry,
     A_Expr *startWithExpr = (A_Expr *)context->startWithExpr;
     A_Expr *connectByExpr = (A_Expr *)context->connectByExpr;
     List *relInfoList = context->relInfoList;
-    Node *whereClause = context->whereClause;
+    Node *whereClauseOnlyJoin = (Node *)copyObject(context->whereClause);
+
+    if (context->whereClause != NULL) {
+        split_where_expr_by_join(&whereClauseOnlyJoin, &(context->whereClause));
+    }
 
     /* make union-all branch for none-recursive part */
     SelectStmt *outerBranch = CreateStartWithCTEOuterBranch(pstate, context,
-                        relInfoList, (Node *)startWithExpr, whereClause);
+                        relInfoList, (Node *)startWithExpr, whereClauseOnlyJoin);
 
     /* make joinExpr for recursive part */
     SelectStmt *innerBranch = CreateStartWithCTEInnerBranch(pstate, context,
-                        relInfoList, (Node *)connectByExpr, whereClause);
+                        relInfoList, (Node *)connectByExpr, whereClauseOnlyJoin);
 
     CreateStartWithCTE(pstate, qry, outerBranch, innerBranch, context);
 
diff --git a/src/test/regress/expected/sw_bugfix-1.out b/src/test/regress/expected/sw_bugfix-1.out
index 7e753871a..805991a58 100644
--- a/src/test/regress/expected/sw_bugfix-1.out
+++ b/src/test/regress/expected/sw_bugfix-1.out
@@ -452,7 +452,6 @@ explain (costs off) select t1.ID,t1.VCH,pid,NAME,PTEX from TEST_HCB_FQB t1,TEST_
                                  QUERY PLAN                                 
 ----------------------------------------------------------------------------
  CTE Scan on tmp_reuslt
-   Filter: ("t1@id" = "t2@id")
    CTE tmp_reuslt
      ->  StartWith Operator
            Start With pseudo atts: RUITR, array_key_1
@@ -471,7 +470,7 @@ explain (costs off) select t1.ID,t1.VCH,pid,NAME,PTEX from TEST_HCB_FQB t1,TEST_
                                    ->  WorkTable Scan on tmp_reuslt
                                    ->  Hash
                                          ->  Seq Scan on test_hcb_fqb t1
-(20 rows)
+(19 rows)
 
 CREATE OR REPLACE FUNCTION test_hcb_pro1(i_id in int) return int
 AS
@@ -1840,3 +1839,29 @@ select max(name) from test3 connect by parentid = prior id group by sys_connect_
 drop table test3;
 drop table test2;
 drop table test1;
+drop table if exists left_table;
+drop table if exists right_table;
+create table left_table(id int);
+create table right_table(id int);
+declare
+i int:=0;
+begin
+for i in 1..5 loop
+insert into left_table values(i);
+insert into right_table values(i+1);
+end loop;
+commit;
+end;
+/
+select left_table.id as id1,right_table.id as id2 from left_table,right_table where left_table.id+1=right_table.id start with left_table.id=1 connect by prior right_table.id=left_table.id;
+ id1 | id2 
+-----+-----
+   1 |   2
+   2 |   3
+   3 |   4
+   4 |   5
+   5 |   6
+(5 rows)
+
+drop table left_table;
+drop table right_table;
diff --git a/src/test/regress/expected/sw_bugfix-2.out b/src/test/regress/expected/sw_bugfix-2.out
index 503977ec7..47da77ea0 100755
--- a/src/test/regress/expected/sw_bugfix-2.out
+++ b/src/test/regress/expected/sw_bugfix-2.out
@@ -2124,3 +2124,70 @@ connect by
 
 drop synonym sy_pf;
 drop table pf_org_rela_test;
+drop table if exists sw_test;
+ 
+-- test join clause in where split and push down into start with/connect by clause
+drop table if exists sw_tb_1;
+create table sw_tb_1(a int,b int,c int,d int);
+create table sw_tb_2(a int,b int,c int,d int);
+insert into sw_tb_1 values(1,1,1,1);
+insert into sw_tb_1 values(2,2,2,2);
+insert into sw_tb_1 values(3,3,3,3);
+insert into sw_tb_1 values(4,4,4,4);
+insert into sw_tb_2 values(1,1,1,1);
+insert into sw_tb_2 values(2,2,2,2);
+insert into sw_tb_2 values(3,3,3,3);
+insert into sw_tb_2 values(4,4,4,4);
+select * from sw_tb_1,sw_tb_2 where sw_tb_1.c=sw_tb_2.d start with sw_tb_1.a>2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+ a | b | c | d | a | b | c | d 
+---+---+---+---+---+---+---+---
+ 3 | 3 | 3 | 3 | 3 | 3 | 3 | 3
+ 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4
+(2 rows)
+
+select * from sw_tb_1,sw_tb_2 where (sw_tb_1.a=sw_tb_2.b or sw_tb_1.a not in (select 3)) and sw_tb_1.c=sw_tb_2.d start with sw_tb_1.a>2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+ a | b | c | d | a | b | c | d 
+---+---+---+---+---+---+---+---
+ 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4
+(1 row)
+
+select * from sw_tb_1,sw_tb_2 where sw_tb_1.a !=3 or sw_tb_1.c=sw_tb_2.d start with sw_tb_1.a>2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+ a | b | c | d | a | b | c | d 
+---+---+---+---+---+---+---+---
+ 4 | 4 | 4 | 4 | 4 | 4 | 4 | 4
+(1 row)
+
+select * from sw_tb_1,sw_tb_2 where (sw_tb_1.a+sw_tb_1.b=sw_tb_2.b or sw_tb_1.a=sw_tb_2.c) and (sw_tb_1.b=sw_tb_2.a or (substr(sw_tb_1.b,2)=substr(sw_tb_2.b,2) and sw_tb_1.b is null)) or (sw_tb_1.c=sw_tb_2.d or sw_tb_1.b!=2) start with sw_tb_1.a=2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+ a | b | c | d | a | b | c | d 
+---+---+---+---+---+---+---+---
+(0 rows)
+
+explain select * from sw_tb_1,sw_tb_2 where (sw_tb_1.a+sw_tb_1.b=sw_tb_2.b or sw_tb_1.a=sw_tb_2.c) and (sw_tb_1.b=sw_tb_2.a or (substr(sw_tb_1.b,2)=substr(sw_tb_2.b,2) and sw_tb_1.b is null)) or (sw_tb_1.c=sw_tb_2.d or sw_tb_1.b!=2) start with sw_tb_1.a=2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+                                                                                                                                                          QUERY PLAN                                                                                                                                                          
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ CTE Scan on tmp_reuslt  (cost=1205332.35..1220171.82 rows=652970 width=32)
+   Filter: (("sw_tb_1@b" IS NULL) OR ("sw_tb_1@b" <> 2))
+   CTE tmp_reuslt
+     ->  StartWith Operator  (cost=0.00..1205332.35 rows=659532 width=32)
+           Start With pseudo atts: RUITR, array_key_4
+           ->  Recursive Union  (cost=0.00..1205332.35 rows=659532 width=32)
+                 ->  Nested Loop  (cost=0.00..659.38 rows=82 width=32)
+                       Join Filter: (((((swtest.sw_tb_1.a + swtest.sw_tb_1.b) = swtest.sw_tb_2.b) OR (swtest.sw_tb_1.a = swtest.sw_tb_2.c)) AND ((swtest.sw_tb_1.b = swtest.sw_tb_2.a) OR (substr((swtest.sw_tb_1.b)::text, 2) = substr((swtest.sw_tb_2.b)::text, 2)))) OR (swtest.sw_tb_1.c = swtest.sw_tb_2.d))
+                       ->  Seq Scan on sw_tb_2  (cost=0.00..27.76 rows=1776 width=16)
+                       ->  Materialize  (cost=0.00..32.25 rows=9 width=16)
+                             ->  Seq Scan on sw_tb_1  (cost=0.00..32.20 rows=9 width=16)
+                                   Filter: (a = 2)
+                 ->  Hash Join  (cost=26.65..119148.23 rows=65945 width=32)
+                       Hash Cond: (swtest.sw_tb_2.c = tmp_reuslt."sw_tb_1@d")
+                       ->  Materialize  (cost=0.00..118341.56 rows=16083 width=32)
+                             ->  Nested Loop  (cost=0.00..118341.56 rows=16083 width=32)
+                                   Join Filter: (((((swtest.sw_tb_1.a + swtest.sw_tb_1.b) = swtest.sw_tb_2.b) OR (swtest.sw_tb_1.a = swtest.sw_tb_2.c)) AND ((swtest.sw_tb_1.b = swtest.sw_tb_2.a) OR (substr((swtest.sw_tb_1.b)::text, 2) = substr((swtest.sw_tb_2.b)::text, 2)))) OR (swtest.sw_tb_1.c = swtest.sw_tb_2.d))
+                                   ->  Seq Scan on sw_tb_1  (cost=0.00..27.76 rows=1776 width=16)
+                                   ->  Materialize  (cost=0.00..36.64 rows=1776 width=16)
+                                         ->  Seq Scan on sw_tb_2  (cost=0.00..27.76 rows=1776 width=16)
+                       ->  Hash  (cost=16.40..16.40 rows=820 width=4)
+                             ->  WorkTable Scan on tmp_reuslt  (cost=0.00..16.40 rows=820 width=4)
+(22 rows)
+
+drop table sw_tb_1;
+drop table sw_tb_2;
diff --git a/src/test/regress/expected/sw_icbc.out b/src/test/regress/expected/sw_icbc.out
index 6dd487021..3c07167aa 100644
--- a/src/test/regress/expected/sw_icbc.out
+++ b/src/test/regress/expected/sw_icbc.out
@@ -512,7 +512,6 @@ explain (costs off) select * from t1, t2 where t1.id = t2.id start with t1.id =
                                  QUERY PLAN                                  
 -----------------------------------------------------------------------------
  CTE Scan on tmp_reuslt
-   Filter: ("t1@id" = "t2@id")
    CTE tmp_reuslt
      ->  StartWith Operator
            Start With pseudo atts: RUITR, array_key_1
@@ -531,7 +530,7 @@ explain (costs off) select * from t1, t2 where t1.id = t2.id start with t1.id =
                                    ->  WorkTable Scan on tmp_reuslt
                        ->  Hash
                              ->  Seq Scan on t2
-(20 rows)
+(19 rows)
 
 explain (costs off) select * from t1 join t2 on t1.id = t2.id start with t1.id = t2.id and t1.id = 1 connect by prior t1.id = t1.pid;
                                  QUERY PLAN                                  
@@ -561,7 +560,6 @@ explain (costs off) select * from t1, (select * from t2) as test where t1.id = t
                                  QUERY PLAN                                  
 -----------------------------------------------------------------------------
  CTE Scan on tmp_reuslt
-   Filter: ("t1@id" = "test@id")
    CTE tmp_reuslt
      ->  StartWith Operator
            Start With pseudo atts: RUITR, array_key_1
@@ -580,7 +578,7 @@ explain (costs off) select * from t1, (select * from t2) as test where t1.id = t
                                    ->  WorkTable Scan on tmp_reuslt
                        ->  Hash
                              ->  Seq Scan on t2
-(20 rows)
+(19 rows)
 
 explain (costs off) select id, (select id from t2 start with t2.id = t1.id connect by t2.id = t1.id limit 1) from t1 where id = 1;
 ERROR:  START WITH CONNECT BY clauses must have at least one prior key.
diff --git a/src/test/regress/sql/sw_bugfix-1.sql b/src/test/regress/sql/sw_bugfix-1.sql
index b651d6541..73b8a05b2 100644
--- a/src/test/regress/sql/sw_bugfix-1.sql
+++ b/src/test/regress/sql/sw_bugfix-1.sql
@@ -522,3 +522,24 @@ select max(name) from test3 connect by parentid = prior id group by sys_connect_
 drop table test3;
 drop table test2;
 drop table test1;
+
+drop table if exists left_table;
+drop table if exists right_table;
+create table left_table(id int);
+create table right_table(id int);
+
+declare
+i int:=0;
+begin
+for i in 1..5 loop
+insert into left_table values(i);
+insert into right_table values(i+1);
+end loop;
+commit;
+end;
+/
+
+select left_table.id as id1,right_table.id as id2 from left_table,right_table where left_table.id+1=right_table.id start with left_table.id=1 connect by prior right_table.id=left_table.id;
+
+drop table left_table;
+drop table right_table;
\ No newline at end of file
diff --git a/src/test/regress/sql/sw_bugfix-2.sql b/src/test/regress/sql/sw_bugfix-2.sql
index ae91aaa59..bc6a37187 100644
--- a/src/test/regress/sql/sw_bugfix-2.sql
+++ b/src/test/regress/sql/sw_bugfix-2.sql
@@ -809,4 +809,26 @@ connect by
     and sy_pf.org_rela_type = 'ADMINISTRATION';
 
 drop synonym sy_pf;
-drop table pf_org_rela_test;
\ No newline at end of file
+drop table pf_org_rela_test;
+
+drop table if exists sw_test;
+ 
+-- test join clause in where split and push down into start with/connect by clause
+drop table if exists sw_tb_1;
+create table sw_tb_1(a int,b int,c int,d int);
+create table sw_tb_2(a int,b int,c int,d int);
+insert into sw_tb_1 values(1,1,1,1);
+insert into sw_tb_1 values(2,2,2,2);
+insert into sw_tb_1 values(3,3,3,3);
+insert into sw_tb_1 values(4,4,4,4);
+insert into sw_tb_2 values(1,1,1,1);
+insert into sw_tb_2 values(2,2,2,2);
+insert into sw_tb_2 values(3,3,3,3);
+insert into sw_tb_2 values(4,4,4,4);
+select * from sw_tb_1,sw_tb_2 where sw_tb_1.c=sw_tb_2.d start with sw_tb_1.a>2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+select * from sw_tb_1,sw_tb_2 where (sw_tb_1.a=sw_tb_2.b or sw_tb_1.a not in (select 3)) and sw_tb_1.c=sw_tb_2.d start with sw_tb_1.a>2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+select * from sw_tb_1,sw_tb_2 where sw_tb_1.a !=3 or sw_tb_1.c=sw_tb_2.d start with sw_tb_1.a>2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+select * from sw_tb_1,sw_tb_2 where (sw_tb_1.a+sw_tb_1.b=sw_tb_2.b or sw_tb_1.a=sw_tb_2.c) and (sw_tb_1.b=sw_tb_2.a or (substr(sw_tb_1.b,2)=substr(sw_tb_2.b,2) and sw_tb_1.b is null)) or (sw_tb_1.c=sw_tb_2.d or sw_tb_1.b!=2) start with sw_tb_1.a=2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+explain select * from sw_tb_1,sw_tb_2 where (sw_tb_1.a+sw_tb_1.b=sw_tb_2.b or sw_tb_1.a=sw_tb_2.c) and (sw_tb_1.b=sw_tb_2.a or (substr(sw_tb_1.b,2)=substr(sw_tb_2.b,2) and sw_tb_1.b is null)) or (sw_tb_1.c=sw_tb_2.d or sw_tb_1.b!=2) start with sw_tb_1.a=2 connect by nocycle prior sw_tb_1.d=sw_tb_2.c;
+drop table sw_tb_1;
+drop table sw_tb_2;
\ No newline at end of file