From 34c85c962f24b5aacfa276785146a810384d0cde Mon Sep 17 00:00:00 2001 From: minghong Date: Fri, 1 Dec 2023 15:48:33 +0800 Subject: [PATCH] [opt](Nereids) improve semi/anti join estimation when column stats are unavailable #27793 this change improves performance of tpch q20. on sf500, improved from 6.3sec to 1.1 sec this change has no impaction on tpcds when column stats is unknown, the basic algorithm to estimate left semi join output row count is its left child output row count. q1: "A left semi join B on A.x=B.x" the output row is estimated as A.rowCount. But the basic algorithm is not good to following pattern: q2: "A left semi join filter(B) on A.x=B.x" Because there is a filter on B, usually this left semi join also reduce the row count of A, and we estimate the output of q2 as A.rowCount * Filter.rowCount/B.rowCount --- .../doris/nereids/stats/JoinEstimation.java | 28 +++- .../doris/nereids/stats/StatsCalculator.java | 15 +- .../plans/physical/AbstractPhysicalJoin.java | 4 +- .../plans/physical/PhysicalOlapScan.java | 5 +- .../doris/statistics/ColumnStatistic.java | 3 +- .../transposeJoin/transposeSemiJoinAgg.out | 157 ++++++------------ .../nostats_rf_prune/q20-rewrite.out | 37 +++-- .../nostats_rf_prune/q20.out | 47 +++--- .../nostats_rf_prune/q21.out | 42 ++--- .../shape_no_stats/q20-rewrite.out | 37 +++-- .../shape_no_stats/q20.out | 47 +++--- .../shape_no_stats/q21.out | 42 ++--- .../transposeJoin/transposeSemiJoinAgg.groovy | 1 + 13 files changed, 232 insertions(+), 233 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index d43171375b..3173b654d4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -20,6 +20,7 @@ package org.apache.doris.nereids.stats; import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.EqualPredicate; +import org.apache.doris.nereids.trees.expressions.EqualTo; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.plans.JoinType; @@ -44,6 +45,7 @@ import java.util.stream.Collectors; */ public class JoinEstimation { private static double DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT = 0.3; + private static double UNKNOWN_COL_STATS_FILTER_SEL_LOWER_BOUND = 0.5; private static EqualPredicate normalizeHashJoinCondition(EqualPredicate equal, Statistics leftStats, Statistics rightStats) { @@ -169,6 +171,27 @@ public class JoinEstimation { .build(); } + private static double computeSelectivityForBuildSideWhenColStatsUnknown(Statistics buildStats, Join join) { + double sel = 1.0; + for (Expression cond : join.getHashJoinConjuncts()) { + if (cond instanceof EqualTo) { + EqualTo equal = (EqualTo) cond; + if (equal.left() instanceof Slot && equal.right() instanceof Slot) { + ColumnStatistic buildColStats = buildStats.findColumnStatistics(equal.left()); + if (buildColStats == null) { + buildColStats = buildStats.findColumnStatistics(equal.right()); + } + if (buildColStats != null) { + double buildSel = Math.min(buildStats.getRowCount() / buildColStats.count, 1.0); + buildSel = Math.max(buildSel, UNKNOWN_COL_STATS_FILTER_SEL_LOWER_BOUND); + sel = Math.min(sel, buildSel); + } + } + } + } + return sel; + } + private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) { if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { double rowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); @@ -245,14 +268,15 @@ public class JoinEstimation { private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) { if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { + double sel = computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join); if (join.getJoinType().isLeftSemiOrAntiJoin()) { - return new StatisticsBuilder().setRowCount(leftStats.getRowCount()) + return new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel) .putColumnStatistics(leftStats.columnStatistics()) .putColumnStatistics(rightStats.columnStatistics()) .build(); } else { //right semi or anti - return new StatisticsBuilder().setRowCount(rightStats.getRowCount()) + return new StatisticsBuilder().setRowCount(rightStats.getRowCount() * sel) .putColumnStatistics(leftStats.columnStatistics()) .putColumnStatistics(rightStats.columnStatistics()) .build(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 741fe60e20..5cd650bbb5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -668,7 +668,20 @@ public class StatsCalculator extends DefaultPlanVisitor { columnStatisticMap.put(slotReference, ColumnStatistic.UNKNOWN); } } - return new Statistics(rowCount, columnStatisticMap); + Statistics stats = new Statistics(rowCount, columnStatisticMap); + stats = normalizeCatalogRelationColumnStatsRowCount(stats); + return stats; + } + + private Statistics normalizeCatalogRelationColumnStatsRowCount(Statistics stats) { + for (Expression slot : stats.columnStatistics().keySet()) { + ColumnStatistic colStats = stats.findColumnStatistics(slot); + Preconditions.checkArgument(colStats != null, + "can not find col stats for %s in table", slot.toSql()); + stats.addColumnStats(slot, + new ColumnStatisticBuilder(colStats).setCount(stats.getRowCount()).build()); + } + return stats; } private Statistics computeTopN(TopN topN) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java index c6bdd7b6b2..6e9cfbb643 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/AbstractPhysicalJoin.java @@ -228,9 +228,9 @@ public abstract class AbstractPhysicalJoin< @Override public String toString() { List args = Lists.newArrayList("type", joinType, + "stats", statistics, "hashCondition", hashJoinConjuncts, - "otherCondition", otherJoinConjuncts, - "stats", statistics); + "otherCondition", otherJoinConjuncts); if (markJoinSlotReference.isPresent()) { args.add("isMarkJoin"); args.add("true"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java index 20c4ebf795..a4a2d7254b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalOlapScan.java @@ -121,8 +121,9 @@ public class PhysicalOlapScan extends PhysicalCatalogRelation implements OlapSca @Override public String toString() { return Utils.toSqlString("PhysicalOlapScan[" + id.asInt() + "]" + getGroupIdWithPrefix(), - "qualified", Utils.qualifiedName(qualifier, table.getName()), - "stats", statistics, "fr", getMutableState(AbstractPlan.FRAGMENT_ID) + "table", table.getName(), + "stats", statistics, + "fr", getMutableState(AbstractPlan.FRAGMENT_ID) ); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java index d1af7756f8..72d0c68f0f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java @@ -298,7 +298,8 @@ public class ColumnStatistic { @Override public String toString() { - return isUnKnown ? "unknown" : String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f, avgSizeByte=%f", + return isUnKnown ? "unknown(" + count + ")" + : String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f, avgSizeByte=%f", ndv, minValue, minExpr, maxValue, maxExpr, count, avgSizeByte); } diff --git a/regression-test/data/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.out b/regression-test/data/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.out index 49cf927de5..4865cbdc7b 100644 --- a/regression-test/data/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.out +++ b/regression-test/data/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.out @@ -1,130 +1,81 @@ -- This file is automatically generated. You should know what you did if you want to edit this -- !groupby_positive_case -- PhysicalResultSink ---PhysicalDistribute -----PhysicalProject -------hashAgg[LOCAL] ---------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] -----------PhysicalProject -------------filter((T1.__DORIS_DELETE_SIGN__ = 0)) ---------------PhysicalOlapScan[T1] apply RFs: RF0 -----------PhysicalDistribute -------------PhysicalProject ---------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) -----------------PhysicalOlapScan[T2] +--hashAgg[LOCAL] +----hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] +------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +--------PhysicalOlapScan[T1] apply RFs: RF0 +------filter((T2.__DORIS_DELETE_SIGN__ = 0)) +--------PhysicalOlapScan[T2] -- !groupby_negative_case -- PhysicalResultSink ---PhysicalDistribute -----PhysicalProject -------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() ---------PhysicalDistribute -----------PhysicalProject -------------hashAgg[LOCAL] ---------------PhysicalProject -----------------filter((T1.__DORIS_DELETE_SIGN__ = 0)) -------------------PhysicalOlapScan[T1] ---------PhysicalDistribute -----------PhysicalProject -------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) ---------------PhysicalOlapScan[T2] +--hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() +----hashAgg[LOCAL] +------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +--------PhysicalOlapScan[T1] +----filter((T2.__DORIS_DELETE_SIGN__ = 0)) +------PhysicalOlapScan[T2] -- !grouping_positive_case -- PhysicalResultSink ---PhysicalDistribute -----PhysicalProject +--hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] +----hashAgg[GLOBAL] ------hashAgg[LOCAL] ---------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] -----------PhysicalDistribute -------------PhysicalRepeat ---------------PhysicalProject -----------------filter((T1.__DORIS_DELETE_SIGN__ = 0)) -------------------PhysicalOlapScan[T1] apply RFs: RF0 -----------PhysicalDistribute -------------PhysicalProject ---------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) -----------------PhysicalOlapScan[T2] +--------PhysicalRepeat +----------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +------------PhysicalOlapScan[T1] apply RFs: RF0 +----filter((T2.__DORIS_DELETE_SIGN__ = 0)) +------PhysicalOlapScan[T2] -- !grouping_negative_case -- PhysicalResultSink ---PhysicalDistribute -----PhysicalProject -------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() ---------PhysicalDistribute -----------PhysicalProject -------------hashAgg[GLOBAL] ---------------PhysicalDistribute -----------------hashAgg[LOCAL] -------------------PhysicalRepeat ---------------------PhysicalProject -----------------------filter((T1.__DORIS_DELETE_SIGN__ = 0)) -------------------------PhysicalOlapScan[T1] ---------PhysicalDistribute -----------PhysicalProject -------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) ---------------PhysicalOlapScan[T2] +--hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() +----hashAgg[GLOBAL] +------hashAgg[LOCAL] +--------PhysicalRepeat +----------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +------------PhysicalOlapScan[T1] +----filter((T2.__DORIS_DELETE_SIGN__ = 0)) +------PhysicalOlapScan[T2] -- !groupby_positive_case2 -- PhysicalResultSink ---PhysicalDistribute -----hashAgg[LOCAL] -------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] ---------PhysicalProject -----------filter((T1.__DORIS_DELETE_SIGN__ = 0)) -------------PhysicalOlapScan[T1] apply RFs: RF0 ---------PhysicalDistribute -----------PhysicalProject -------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) ---------------PhysicalOlapScan[T2] +--hashAgg[LOCAL] +----hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] +------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +--------PhysicalOlapScan[T1] apply RFs: RF0 +------filter((T2.__DORIS_DELETE_SIGN__ = 0)) +--------PhysicalOlapScan[T2] -- !groupby_negative_case2 -- PhysicalResultSink ---PhysicalDistribute -----PhysicalProject -------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() ---------PhysicalDistribute -----------PhysicalProject -------------hashAgg[LOCAL] ---------------PhysicalProject -----------------filter((T1.__DORIS_DELETE_SIGN__ = 0)) -------------------PhysicalOlapScan[T1] ---------PhysicalDistribute -----------PhysicalProject -------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) ---------------PhysicalOlapScan[T2] +--hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() +----hashAgg[LOCAL] +------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +--------PhysicalOlapScan[T1] +----filter((T2.__DORIS_DELETE_SIGN__ = 0)) +------PhysicalOlapScan[T2] -- !grouping_positive_case2 -- PhysicalResultSink ---PhysicalDistribute -----PhysicalProject +--hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] +----hashAgg[GLOBAL] ------hashAgg[LOCAL] ---------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.a = T2.a)) otherCondition=() build RFs:RF0 a->[a] -----------PhysicalDistribute -------------PhysicalRepeat ---------------PhysicalProject -----------------filter((T1.__DORIS_DELETE_SIGN__ = 0)) -------------------PhysicalOlapScan[T1] apply RFs: RF0 -----------PhysicalDistribute -------------PhysicalProject ---------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) -----------------PhysicalOlapScan[T2] +--------PhysicalRepeat +----------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +------------PhysicalOlapScan[T1] apply RFs: RF0 +----filter((T2.__DORIS_DELETE_SIGN__ = 0)) +------PhysicalOlapScan[T2] -- !grouping_negative_case2 -- PhysicalResultSink ---PhysicalDistribute -----PhysicalProject -------hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() ---------PhysicalDistribute -----------PhysicalProject -------------hashAgg[GLOBAL] ---------------PhysicalDistribute -----------------hashAgg[LOCAL] -------------------PhysicalRepeat ---------------------PhysicalProject -----------------------filter((T1.__DORIS_DELETE_SIGN__ = 0)) -------------------------PhysicalOlapScan[T1] ---------PhysicalDistribute -----------PhysicalProject -------------filter((T2.__DORIS_DELETE_SIGN__ = 0)) ---------------PhysicalOlapScan[T2] +--hashJoin[LEFT_SEMI_JOIN] hashCondition=((T3.D = expr_cast(a as BIGINT))) otherCondition=() +----hashAgg[GLOBAL] +------hashAgg[LOCAL] +--------PhysicalRepeat +----------filter((T1.__DORIS_DELETE_SIGN__ = 0)) +------------PhysicalOlapScan[T1] +----filter((T2.__DORIS_DELETE_SIGN__ = 0)) +------PhysicalOlapScan[T2] diff --git a/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20-rewrite.out b/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20-rewrite.out index 7c1b5920e4..525bc7be55 100644 --- a/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20-rewrite.out +++ b/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20-rewrite.out @@ -5,30 +5,31 @@ PhysicalResultSink ----PhysicalDistribute ------PhysicalQuickSort[LOCAL_SORT] --------PhysicalProject -----------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = t3.ps_suppkey)) otherCondition=() build RFs:RF4 s_suppkey->[ps_suppkey] -------------PhysicalDistribute ---------------PhysicalProject -----------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF3 p_partkey->[ps_partkey] -------------------hashJoin[INNER_JOIN] hashCondition=((t2.l_partkey = t1.ps_partkey) and (t2.l_suppkey = t1.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > t2.l_q)) build RFs:RF1 l_partkey->[ps_partkey];RF2 l_suppkey->[ps_suppkey] ---------------------PhysicalProject -----------------------PhysicalOlapScan[partsupp] apply RFs: RF1 RF2 RF3 RF4 ---------------------PhysicalDistribute +----------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF4 n_nationkey->[s_nationkey] +------------PhysicalProject +--------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = t3.ps_suppkey)) otherCondition=() build RFs:RF3 s_suppkey->[ps_suppkey] +----------------PhysicalDistribute +------------------PhysicalProject +--------------------hashJoin[INNER_JOIN] hashCondition=((t2.l_partkey = t1.ps_partkey) and (t2.l_suppkey = t1.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > t2.l_q)) build RFs:RF1 ps_partkey->[l_partkey];RF2 ps_suppkey->[l_suppkey] ----------------------PhysicalProject ------------------------hashAgg[GLOBAL] --------------------------PhysicalDistribute ----------------------------hashAgg[LOCAL] ------------------------------PhysicalProject --------------------------------filter((lineitem.l_shipdate < '1995-01-01') and (lineitem.l_shipdate >= '1994-01-01')) -----------------------------------PhysicalOlapScan[lineitem] -------------------PhysicalProject ---------------------filter((p_name like 'forest%')) -----------------------PhysicalOlapScan[part] -------------PhysicalDistribute ---------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF0 n_nationkey->[s_nationkey] -----------------PhysicalProject -------------------PhysicalOlapScan[supplier] apply RFs: RF0 +----------------------------------PhysicalOlapScan[lineitem] apply RFs: RF1 RF2 +----------------------PhysicalDistribute +------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF0 p_partkey->[ps_partkey] +--------------------------PhysicalProject +----------------------------PhysicalOlapScan[partsupp] apply RFs: RF0 RF3 +--------------------------PhysicalProject +----------------------------filter((p_name like 'forest%')) +------------------------------PhysicalOlapScan[part] ----------------PhysicalDistribute ------------------PhysicalProject ---------------------filter((nation.n_name = 'CANADA')) -----------------------PhysicalOlapScan[nation] +--------------------PhysicalOlapScan[supplier] apply RFs: RF4 +------------PhysicalDistribute +--------------PhysicalProject +----------------filter((nation.n_name = 'CANADA')) +------------------PhysicalOlapScan[nation] diff --git a/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20.out b/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20.out index a7838d0450..1292e0468f 100644 --- a/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20.out +++ b/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q20.out @@ -5,29 +5,30 @@ PhysicalResultSink ----PhysicalDistribute ------PhysicalQuickSort[LOCAL_SORT] --------PhysicalProject -----------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = partsupp.ps_suppkey)) otherCondition=() build RFs:RF4 s_suppkey->[ps_suppkey] -------------PhysicalDistribute ---------------PhysicalProject -----------------hashJoin[INNER_JOIN] hashCondition=((lineitem.l_partkey = partsupp.ps_partkey) and (lineitem.l_suppkey = partsupp.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > (0.5 * sum(l_quantity)))) build RFs:RF2 l_partkey->[ps_partkey];RF3 l_suppkey->[ps_suppkey] -------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF1 p_partkey->[ps_partkey] ---------------------PhysicalProject -----------------------PhysicalOlapScan[partsupp] apply RFs: RF1 RF2 RF3 RF4 ---------------------PhysicalProject -----------------------filter((p_name like 'forest%')) -------------------------PhysicalOlapScan[part] -------------------PhysicalDistribute ---------------------hashAgg[GLOBAL] -----------------------PhysicalDistribute -------------------------hashAgg[LOCAL] ---------------------------PhysicalProject -----------------------------filter((lineitem.l_shipdate < '1995-01-01') and (lineitem.l_shipdate >= '1994-01-01')) -------------------------------PhysicalOlapScan[lineitem] -------------PhysicalDistribute ---------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF0 n_nationkey->[s_nationkey] -----------------PhysicalProject -------------------PhysicalOlapScan[supplier] apply RFs: RF0 +----------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF4 n_nationkey->[s_nationkey] +------------PhysicalProject +--------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = partsupp.ps_suppkey)) otherCondition=() build RFs:RF3 s_suppkey->[ps_suppkey] ----------------PhysicalDistribute ------------------PhysicalProject ---------------------filter((nation.n_name = 'CANADA')) -----------------------PhysicalOlapScan[nation] +--------------------hashJoin[INNER_JOIN] hashCondition=((lineitem.l_partkey = partsupp.ps_partkey) and (lineitem.l_suppkey = partsupp.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > (0.5 * sum(l_quantity)))) build RFs:RF1 ps_partkey->[l_partkey];RF2 ps_suppkey->[l_suppkey] +----------------------hashAgg[GLOBAL] +------------------------PhysicalDistribute +--------------------------hashAgg[LOCAL] +----------------------------PhysicalProject +------------------------------filter((lineitem.l_shipdate < '1995-01-01') and (lineitem.l_shipdate >= '1994-01-01')) +--------------------------------PhysicalOlapScan[lineitem] apply RFs: RF1 RF2 +----------------------PhysicalDistribute +------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF0 p_partkey->[ps_partkey] +--------------------------PhysicalProject +----------------------------PhysicalOlapScan[partsupp] apply RFs: RF0 RF3 +--------------------------PhysicalProject +----------------------------filter((p_name like 'forest%')) +------------------------------PhysicalOlapScan[part] +----------------PhysicalDistribute +------------------PhysicalProject +--------------------PhysicalOlapScan[supplier] apply RFs: RF4 +------------PhysicalDistribute +--------------PhysicalProject +----------------filter((nation.n_name = 'CANADA')) +------------------PhysicalOlapScan[nation] diff --git a/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q21.out b/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q21.out index b8378fe828..b248bc83d6 100644 --- a/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q21.out +++ b/regression-test/data/nereids_tpch_shape_sf1000_p0/nostats_rf_prune/q21.out @@ -8,27 +8,29 @@ PhysicalResultSink ----------PhysicalDistribute ------------hashAgg[LOCAL] --------------PhysicalProject -----------------hashJoin[LEFT_ANTI_JOIN] hashCondition=((l3.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) -------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((l2.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) build RFs:RF3 l_orderkey->[l_orderkey] ---------------------PhysicalProject -----------------------PhysicalOlapScan[lineitem] apply RFs: RF3 ---------------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF2 n_nationkey->[s_nationkey] -----------------------hashJoin[INNER_JOIN] hashCondition=((orders.o_orderkey = l1.l_orderkey)) otherCondition=() build RFs:RF1 o_orderkey->[l_orderkey] -------------------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_suppkey = l1.l_suppkey)) otherCondition=() build RFs:RF0 s_suppkey->[l_suppkey] ---------------------------PhysicalProject -----------------------------filter((l1.l_receiptdate > l1.l_commitdate)) -------------------------------PhysicalOlapScan[lineitem] apply RFs: RF0 RF1 ---------------------------PhysicalDistribute +----------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF3 n_nationkey->[s_nationkey] +------------------PhysicalProject +--------------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_suppkey = l1.l_suppkey)) otherCondition=() build RFs:RF2 s_suppkey->[l_suppkey] +----------------------PhysicalDistribute +------------------------hashJoin[INNER_JOIN] hashCondition=((orders.o_orderkey = l1.l_orderkey)) otherCondition=() build RFs:RF1 o_orderkey->[l_orderkey] +--------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((l2.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) build RFs:RF0 l_orderkey->[l_orderkey] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[supplier] apply RFs: RF2 -------------------------PhysicalProject ---------------------------filter((orders.o_orderstatus = 'F')) -----------------------------PhysicalOlapScan[orders] +------------------------------PhysicalOlapScan[lineitem] apply RFs: RF0 +----------------------------hashJoin[LEFT_ANTI_JOIN] hashCondition=((l3.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) +------------------------------PhysicalProject +--------------------------------filter((l1.l_receiptdate > l1.l_commitdate)) +----------------------------------PhysicalOlapScan[lineitem] apply RFs: RF1 RF2 +------------------------------PhysicalProject +--------------------------------filter((l3.l_receiptdate > l3.l_commitdate)) +----------------------------------PhysicalOlapScan[lineitem] +--------------------------PhysicalProject +----------------------------filter((orders.o_orderstatus = 'F')) +------------------------------PhysicalOlapScan[orders] ----------------------PhysicalDistribute ------------------------PhysicalProject ---------------------------filter((nation.n_name = 'SAUDI ARABIA')) -----------------------------PhysicalOlapScan[nation] -------------------PhysicalProject ---------------------filter((l3.l_receiptdate > l3.l_commitdate)) -----------------------PhysicalOlapScan[lineitem] +--------------------------PhysicalOlapScan[supplier] apply RFs: RF3 +------------------PhysicalDistribute +--------------------PhysicalProject +----------------------filter((nation.n_name = 'SAUDI ARABIA')) +------------------------PhysicalOlapScan[nation] diff --git a/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20-rewrite.out b/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20-rewrite.out index 7c1b5920e4..525bc7be55 100644 --- a/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20-rewrite.out +++ b/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20-rewrite.out @@ -5,30 +5,31 @@ PhysicalResultSink ----PhysicalDistribute ------PhysicalQuickSort[LOCAL_SORT] --------PhysicalProject -----------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = t3.ps_suppkey)) otherCondition=() build RFs:RF4 s_suppkey->[ps_suppkey] -------------PhysicalDistribute ---------------PhysicalProject -----------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF3 p_partkey->[ps_partkey] -------------------hashJoin[INNER_JOIN] hashCondition=((t2.l_partkey = t1.ps_partkey) and (t2.l_suppkey = t1.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > t2.l_q)) build RFs:RF1 l_partkey->[ps_partkey];RF2 l_suppkey->[ps_suppkey] ---------------------PhysicalProject -----------------------PhysicalOlapScan[partsupp] apply RFs: RF1 RF2 RF3 RF4 ---------------------PhysicalDistribute +----------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF4 n_nationkey->[s_nationkey] +------------PhysicalProject +--------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = t3.ps_suppkey)) otherCondition=() build RFs:RF3 s_suppkey->[ps_suppkey] +----------------PhysicalDistribute +------------------PhysicalProject +--------------------hashJoin[INNER_JOIN] hashCondition=((t2.l_partkey = t1.ps_partkey) and (t2.l_suppkey = t1.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > t2.l_q)) build RFs:RF1 ps_partkey->[l_partkey];RF2 ps_suppkey->[l_suppkey] ----------------------PhysicalProject ------------------------hashAgg[GLOBAL] --------------------------PhysicalDistribute ----------------------------hashAgg[LOCAL] ------------------------------PhysicalProject --------------------------------filter((lineitem.l_shipdate < '1995-01-01') and (lineitem.l_shipdate >= '1994-01-01')) -----------------------------------PhysicalOlapScan[lineitem] -------------------PhysicalProject ---------------------filter((p_name like 'forest%')) -----------------------PhysicalOlapScan[part] -------------PhysicalDistribute ---------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF0 n_nationkey->[s_nationkey] -----------------PhysicalProject -------------------PhysicalOlapScan[supplier] apply RFs: RF0 +----------------------------------PhysicalOlapScan[lineitem] apply RFs: RF1 RF2 +----------------------PhysicalDistribute +------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF0 p_partkey->[ps_partkey] +--------------------------PhysicalProject +----------------------------PhysicalOlapScan[partsupp] apply RFs: RF0 RF3 +--------------------------PhysicalProject +----------------------------filter((p_name like 'forest%')) +------------------------------PhysicalOlapScan[part] ----------------PhysicalDistribute ------------------PhysicalProject ---------------------filter((nation.n_name = 'CANADA')) -----------------------PhysicalOlapScan[nation] +--------------------PhysicalOlapScan[supplier] apply RFs: RF4 +------------PhysicalDistribute +--------------PhysicalProject +----------------filter((nation.n_name = 'CANADA')) +------------------PhysicalOlapScan[nation] diff --git a/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20.out b/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20.out index a7838d0450..1292e0468f 100644 --- a/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20.out +++ b/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q20.out @@ -5,29 +5,30 @@ PhysicalResultSink ----PhysicalDistribute ------PhysicalQuickSort[LOCAL_SORT] --------PhysicalProject -----------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = partsupp.ps_suppkey)) otherCondition=() build RFs:RF4 s_suppkey->[ps_suppkey] -------------PhysicalDistribute ---------------PhysicalProject -----------------hashJoin[INNER_JOIN] hashCondition=((lineitem.l_partkey = partsupp.ps_partkey) and (lineitem.l_suppkey = partsupp.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > (0.5 * sum(l_quantity)))) build RFs:RF2 l_partkey->[ps_partkey];RF3 l_suppkey->[ps_suppkey] -------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF1 p_partkey->[ps_partkey] ---------------------PhysicalProject -----------------------PhysicalOlapScan[partsupp] apply RFs: RF1 RF2 RF3 RF4 ---------------------PhysicalProject -----------------------filter((p_name like 'forest%')) -------------------------PhysicalOlapScan[part] -------------------PhysicalDistribute ---------------------hashAgg[GLOBAL] -----------------------PhysicalDistribute -------------------------hashAgg[LOCAL] ---------------------------PhysicalProject -----------------------------filter((lineitem.l_shipdate < '1995-01-01') and (lineitem.l_shipdate >= '1994-01-01')) -------------------------------PhysicalOlapScan[lineitem] -------------PhysicalDistribute ---------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF0 n_nationkey->[s_nationkey] -----------------PhysicalProject -------------------PhysicalOlapScan[supplier] apply RFs: RF0 +----------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF4 n_nationkey->[s_nationkey] +------------PhysicalProject +--------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((supplier.s_suppkey = partsupp.ps_suppkey)) otherCondition=() build RFs:RF3 s_suppkey->[ps_suppkey] ----------------PhysicalDistribute ------------------PhysicalProject ---------------------filter((nation.n_name = 'CANADA')) -----------------------PhysicalOlapScan[nation] +--------------------hashJoin[INNER_JOIN] hashCondition=((lineitem.l_partkey = partsupp.ps_partkey) and (lineitem.l_suppkey = partsupp.ps_suppkey)) otherCondition=((cast(ps_availqty as DECIMALV3(38, 3)) > (0.5 * sum(l_quantity)))) build RFs:RF1 ps_partkey->[l_partkey];RF2 ps_suppkey->[l_suppkey] +----------------------hashAgg[GLOBAL] +------------------------PhysicalDistribute +--------------------------hashAgg[LOCAL] +----------------------------PhysicalProject +------------------------------filter((lineitem.l_shipdate < '1995-01-01') and (lineitem.l_shipdate >= '1994-01-01')) +--------------------------------PhysicalOlapScan[lineitem] apply RFs: RF1 RF2 +----------------------PhysicalDistribute +------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((partsupp.ps_partkey = part.p_partkey)) otherCondition=() build RFs:RF0 p_partkey->[ps_partkey] +--------------------------PhysicalProject +----------------------------PhysicalOlapScan[partsupp] apply RFs: RF0 RF3 +--------------------------PhysicalProject +----------------------------filter((p_name like 'forest%')) +------------------------------PhysicalOlapScan[part] +----------------PhysicalDistribute +------------------PhysicalProject +--------------------PhysicalOlapScan[supplier] apply RFs: RF4 +------------PhysicalDistribute +--------------PhysicalProject +----------------filter((nation.n_name = 'CANADA')) +------------------PhysicalOlapScan[nation] diff --git a/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q21.out b/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q21.out index b8378fe828..b248bc83d6 100644 --- a/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q21.out +++ b/regression-test/data/nereids_tpch_shape_sf1000_p0/shape_no_stats/q21.out @@ -8,27 +8,29 @@ PhysicalResultSink ----------PhysicalDistribute ------------hashAgg[LOCAL] --------------PhysicalProject -----------------hashJoin[LEFT_ANTI_JOIN] hashCondition=((l3.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) -------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((l2.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) build RFs:RF3 l_orderkey->[l_orderkey] ---------------------PhysicalProject -----------------------PhysicalOlapScan[lineitem] apply RFs: RF3 ---------------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF2 n_nationkey->[s_nationkey] -----------------------hashJoin[INNER_JOIN] hashCondition=((orders.o_orderkey = l1.l_orderkey)) otherCondition=() build RFs:RF1 o_orderkey->[l_orderkey] -------------------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_suppkey = l1.l_suppkey)) otherCondition=() build RFs:RF0 s_suppkey->[l_suppkey] ---------------------------PhysicalProject -----------------------------filter((l1.l_receiptdate > l1.l_commitdate)) -------------------------------PhysicalOlapScan[lineitem] apply RFs: RF0 RF1 ---------------------------PhysicalDistribute +----------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_nationkey = nation.n_nationkey)) otherCondition=() build RFs:RF3 n_nationkey->[s_nationkey] +------------------PhysicalProject +--------------------hashJoin[INNER_JOIN] hashCondition=((supplier.s_suppkey = l1.l_suppkey)) otherCondition=() build RFs:RF2 s_suppkey->[l_suppkey] +----------------------PhysicalDistribute +------------------------hashJoin[INNER_JOIN] hashCondition=((orders.o_orderkey = l1.l_orderkey)) otherCondition=() build RFs:RF1 o_orderkey->[l_orderkey] +--------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((l2.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) build RFs:RF0 l_orderkey->[l_orderkey] ----------------------------PhysicalProject -------------------------------PhysicalOlapScan[supplier] apply RFs: RF2 -------------------------PhysicalProject ---------------------------filter((orders.o_orderstatus = 'F')) -----------------------------PhysicalOlapScan[orders] +------------------------------PhysicalOlapScan[lineitem] apply RFs: RF0 +----------------------------hashJoin[LEFT_ANTI_JOIN] hashCondition=((l3.l_orderkey = l1.l_orderkey)) otherCondition=(( not (l_suppkey = l_suppkey))) +------------------------------PhysicalProject +--------------------------------filter((l1.l_receiptdate > l1.l_commitdate)) +----------------------------------PhysicalOlapScan[lineitem] apply RFs: RF1 RF2 +------------------------------PhysicalProject +--------------------------------filter((l3.l_receiptdate > l3.l_commitdate)) +----------------------------------PhysicalOlapScan[lineitem] +--------------------------PhysicalProject +----------------------------filter((orders.o_orderstatus = 'F')) +------------------------------PhysicalOlapScan[orders] ----------------------PhysicalDistribute ------------------------PhysicalProject ---------------------------filter((nation.n_name = 'SAUDI ARABIA')) -----------------------------PhysicalOlapScan[nation] -------------------PhysicalProject ---------------------filter((l3.l_receiptdate > l3.l_commitdate)) -----------------------PhysicalOlapScan[lineitem] +--------------------------PhysicalOlapScan[supplier] apply RFs: RF3 +------------------PhysicalDistribute +--------------------PhysicalProject +----------------------filter((nation.n_name = 'SAUDI ARABIA')) +------------------------PhysicalOlapScan[nation] diff --git a/regression-test/suites/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.groovy b/regression-test/suites/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.groovy index ea557f0de7..c3370019cb 100644 --- a/regression-test/suites/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.groovy +++ b/regression-test/suites/nereids_rules_p0/transposeJoin/transposeSemiJoinAgg.groovy @@ -22,6 +22,7 @@ suite("transposeSemiJoinAgg") { sql "SET enable_nereids_planner=true" sql "SET enable_fallback_to_original_planner=false" sql "set partition_pruning_expand_threshold=10;" + sql "set ignore_shape_nodes='PhysicalDistribute,PhysicalProject'" sql "drop table if exists T1;" sql """ CREATE TABLE T1 (