[opt](nereids) optimize filter estimation for pattern "col=col" #18716

TPC-H Q10 and Q5 benefit from this optimization.

For a given hash join condition A = B, both A and B are sometimes reduced by filters. In this PR, both reductions are taken into account in the join estimation.
This commit is contained in:
minghong
2023-04-17 11:44:35 +08:00
committed by GitHub
parent b5b0148010
commit a2278dbc6c
5 changed files with 67 additions and 56 deletions

View File

@ -389,7 +389,22 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
rightBuilder.setNdv(rightIntersectLeft.getDistinctValues());
rightBuilder.setMinValue(rightIntersectLeft.getLow());
rightBuilder.setMaxValue(rightIntersectLeft.getDistinctValues());
double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv));
double sel;
double reduceRatio = 0.25;
double bothSideReducedRatio = 0.9;
if (leftStats.ndv < leftStats.originalNdv * bothSideReducedRatio
&& rightStats.ndv < rightStats.originalNdv * bothSideReducedRatio) {
double sel1;
if (leftStats.ndv > rightStats.ndv) {
sel1 = 1 / StatsMathUtil.nonZeroDivisor(leftStats.ndv);
} else {
sel1 = 1 / StatsMathUtil.nonZeroDivisor(rightStats.ndv);
}
double sel2 = Math.min(rightStats.ndv / rightStats.originalNdv, leftStats.ndv / leftStats.originalNdv);
sel = sel1 * Math.pow(sel2, reduceRatio);
} else {
sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv));
}
Statistics updatedStatistics = context.statistics.withSel(sel);
updatedStatistics.addColumnStats(leftExpr, leftBuilder.build());
updatedStatistics.addColumnStats(rightExpr, rightBuilder.build());

View File

@ -5,22 +5,20 @@ PhysicalTopN
----PhysicalTopN
------PhysicalProject
--------hashAgg[LOCAL]
----------PhysicalDistribute
------------PhysicalProject
--------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
----------PhysicalProject
------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
--------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
----------------PhysicalProject
------------------filter((lineitem.l_returnflag = 'R'))
--------------------PhysicalOlapScan[lineitem]
------------------PhysicalOlapScan[customer]
----------------PhysicalDistribute
------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
--------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
----------------------PhysicalProject
------------------------PhysicalOlapScan[customer]
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
----------------------------PhysicalOlapScan[orders]
--------------------PhysicalDistribute
----------------------PhysicalProject
------------------------PhysicalOlapScan[nation]
------------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
--------------------PhysicalProject
----------------------filter((lineitem.l_returnflag = 'R'))
------------------------PhysicalOlapScan[lineitem]
--------------------PhysicalProject
----------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
------------------------PhysicalOlapScan[orders]
--------------PhysicalDistribute
----------------PhysicalProject
------------------PhysicalOlapScan[nation]

View File

@ -9,27 +9,27 @@ PhysicalQuickSort
------------PhysicalProject
--------------hashJoin[INNER_JOIN](customer.c_nationkey = supplier.s_nationkey)(customer.c_custkey = orders.o_custkey)
----------------PhysicalProject
------------------hashJoin[INNER_JOIN](lineitem.l_suppkey = supplier.s_suppkey)
--------------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
----------------------PhysicalProject
------------------------PhysicalOlapScan[lineitem]
----------------------PhysicalProject
------------------------filter((orders.o_orderdate < 1995-01-01)(orders.o_orderdate >= 1994-01-01))
--------------------------PhysicalOlapScan[orders]
--------------------PhysicalDistribute
----------------------PhysicalProject
------------------------hashJoin[INNER_JOIN](supplier.s_nationkey = nation.n_nationkey)
--------------------------PhysicalProject
----------------------------PhysicalOlapScan[supplier]
--------------------------PhysicalDistribute
----------------------------hashJoin[INNER_JOIN](nation.n_regionkey = region.r_regionkey)
------------------------------PhysicalProject
--------------------------------PhysicalOlapScan[nation]
------------------------------PhysicalDistribute
--------------------------------PhysicalProject
----------------------------------filter((region.r_name = 'ASIA'))
------------------------------------PhysicalOlapScan[region]
------------------PhysicalOlapScan[customer]
----------------PhysicalDistribute
------------------PhysicalProject
--------------------PhysicalOlapScan[customer]
--------------------hashJoin[INNER_JOIN](lineitem.l_suppkey = supplier.s_suppkey)
----------------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
------------------------PhysicalProject
--------------------------PhysicalOlapScan[lineitem]
------------------------PhysicalProject
--------------------------filter((orders.o_orderdate < 1995-01-01)(orders.o_orderdate >= 1994-01-01))
----------------------------PhysicalOlapScan[orders]
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------hashJoin[INNER_JOIN](supplier.s_nationkey = nation.n_nationkey)
----------------------------PhysicalProject
------------------------------PhysicalOlapScan[supplier]
----------------------------PhysicalDistribute
------------------------------hashJoin[INNER_JOIN](nation.n_regionkey = region.r_regionkey)
--------------------------------PhysicalProject
----------------------------------PhysicalOlapScan[nation]
--------------------------------PhysicalDistribute
----------------------------------PhysicalProject
------------------------------------filter((region.r_name = 'ASIA'))
--------------------------------------PhysicalOlapScan[region]

View File

@ -5,22 +5,20 @@ PhysicalTopN
----PhysicalTopN
------PhysicalProject
--------hashAgg[LOCAL]
----------PhysicalDistribute
------------PhysicalProject
--------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
----------PhysicalProject
------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
--------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
----------------PhysicalProject
------------------filter((lineitem.l_returnflag = 'R'))
--------------------PhysicalOlapScan[lineitem]
------------------PhysicalOlapScan[customer]
----------------PhysicalDistribute
------------------hashJoin[INNER_JOIN](customer.c_nationkey = nation.n_nationkey)
--------------------hashJoin[INNER_JOIN](customer.c_custkey = orders.o_custkey)
----------------------PhysicalProject
------------------------PhysicalOlapScan[customer]
----------------------PhysicalDistribute
------------------------PhysicalProject
--------------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
----------------------------PhysicalOlapScan[orders]
--------------------PhysicalDistribute
----------------------PhysicalProject
------------------------PhysicalOlapScan[nation]
------------------hashJoin[INNER_JOIN](lineitem.l_orderkey = orders.o_orderkey)
--------------------PhysicalProject
----------------------filter((lineitem.l_returnflag = 'R'))
------------------------PhysicalOlapScan[lineitem]
--------------------PhysicalProject
----------------------filter((orders.o_orderdate < 1994-01-01)(orders.o_orderdate >= 1993-10-01))
------------------------PhysicalOlapScan[orders]
--------------PhysicalDistribute
----------------PhysicalProject
------------------PhysicalOlapScan[nation]

View File

@ -8,6 +8,8 @@ PhysicalQuickSort
----------hashAgg[LOCAL]
------------PhysicalProject
--------------hashJoin[INNER_JOIN](customer.c_nationkey = supplier.s_nationkey)(customer.c_custkey = orders.o_custkey)
----------------PhysicalProject
------------------PhysicalOlapScan[customer]
----------------PhysicalDistribute
------------------PhysicalProject
--------------------hashJoin[INNER_JOIN](lineitem.l_suppkey = supplier.s_suppkey)
@ -30,6 +32,4 @@ PhysicalQuickSort
----------------------------------PhysicalProject
------------------------------------filter((region.r_name = 'ASIA'))
--------------------------------------PhysicalOlapScan[region]
----------------PhysicalProject
------------------PhysicalOlapScan[customer]