From c5a53e0caa4424fe56f3766c1407c458b8a7c56a Mon Sep 17 00:00:00 2001 From: minghong Date: Thu, 11 May 2023 19:03:11 +0800 Subject: [PATCH] [tpch](nereids) estimate cost with unknown column stats #19046 make nereids generate more reasonable plans with table row count, but without column stats. TODO: q5 and q7 is not good, because of column correlation ps_suppkey and ps_partkey --- .../doris/nereids/stats/JoinEstimation.java | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index 7afe007612..ac8545a1bd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -21,6 +21,7 @@ import org.apache.doris.common.Pair; import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.trees.expressions.EqualTo; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.plans.JoinType; import org.apache.doris.nereids.trees.plans.algebra.Join; import org.apache.doris.nereids.util.ExpressionUtils; @@ -51,7 +52,32 @@ public class JoinEstimation { } } + private static boolean hashJoinConditionContainsUnknownColumnStats(Statistics leftStats, + Statistics rightStats, Join join) { + for (Expression expr : join.getHashJoinConjuncts()) { + for (Slot slot : expr.getInputSlots()) { + ColumnStatistic colStats = leftStats.findColumnStatistics(slot); + if (colStats == null) { + colStats = rightStats.findColumnStatistics(slot); + } + if (colStats == null || colStats.isUnKnown) { + return true; + } + } + } + return false; + } + private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) { + if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { + double rowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); + rowCount = Math.max(1, rowCount); + return new StatisticsBuilder() + .setRowCount(rowCount) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); + } /* * When we estimate filter A=B, * if any side of equation, A or B, is almost unique, the confidence level of estimation is high. @@ -174,6 +200,20 @@ public class JoinEstimation { } private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) { + if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { + if (join.getJoinType().isLeftSemiOrAntiJoin()) { + return new StatisticsBuilder().setRowCount(leftStats.getRowCount()) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); + } else { + //right semi or anti + return new StatisticsBuilder().setRowCount(rightStats.getRowCount()) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); + } + } double rowCount = Double.POSITIVE_INFINITY; for (Expression conjunct : join.getHashJoinConjuncts()) { double eqRowCount = estimateSemiOrAntiRowCountBySlotsEqual(leftStats, rightStats,