[opt](Nereids) improve semi/anti join estimation when column stats are unavailable #27793
this change improves performance of tpch q20. on sf500, improved from 6.3sec to 1.1 sec this change has no impaction on tpcds when column stats is unknown, the basic algorithm to estimate left semi join output row count is its left child output row count. q1: "A left semi join B on A.x=B.x" the output row is estimated as A.rowCount. But the basic algorithm is not good to following pattern: q2: "A left semi join filter(B) on A.x=B.x" Because there is a filter on B, usually this left semi join also reduce the row count of A, and we estimate the output of q2 as A.rowCount * Filter.rowCount/B.rowCount
This commit is contained in:
@ -20,6 +20,7 @@ package org.apache.doris.nereids.stats;
|
||||
import org.apache.doris.nereids.exceptions.AnalysisException;
|
||||
import org.apache.doris.nereids.trees.expressions.Cast;
|
||||
import org.apache.doris.nereids.trees.expressions.EqualPredicate;
|
||||
import org.apache.doris.nereids.trees.expressions.EqualTo;
|
||||
import org.apache.doris.nereids.trees.expressions.Expression;
|
||||
import org.apache.doris.nereids.trees.expressions.Slot;
|
||||
import org.apache.doris.nereids.trees.plans.JoinType;
|
||||
@ -44,6 +45,7 @@ import java.util.stream.Collectors;
|
||||
*/
|
||||
public class JoinEstimation {
|
||||
private static double DEFAULT_ANTI_JOIN_SELECTIVITY_COEFFICIENT = 0.3;
|
||||
private static double UNKNOWN_COL_STATS_FILTER_SEL_LOWER_BOUND = 0.5;
|
||||
|
||||
private static EqualPredicate normalizeHashJoinCondition(EqualPredicate equal, Statistics leftStats,
|
||||
Statistics rightStats) {
|
||||
@ -169,6 +171,27 @@ public class JoinEstimation {
|
||||
.build();
|
||||
}
|
||||
|
||||
private static double computeSelectivityForBuildSideWhenColStatsUnknown(Statistics buildStats, Join join) {
|
||||
double sel = 1.0;
|
||||
for (Expression cond : join.getHashJoinConjuncts()) {
|
||||
if (cond instanceof EqualTo) {
|
||||
EqualTo equal = (EqualTo) cond;
|
||||
if (equal.left() instanceof Slot && equal.right() instanceof Slot) {
|
||||
ColumnStatistic buildColStats = buildStats.findColumnStatistics(equal.left());
|
||||
if (buildColStats == null) {
|
||||
buildColStats = buildStats.findColumnStatistics(equal.right());
|
||||
}
|
||||
if (buildColStats != null) {
|
||||
double buildSel = Math.min(buildStats.getRowCount() / buildColStats.count, 1.0);
|
||||
buildSel = Math.max(buildSel, UNKNOWN_COL_STATS_FILTER_SEL_LOWER_BOUND);
|
||||
sel = Math.min(sel, buildSel);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return sel;
|
||||
}
|
||||
|
||||
private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) {
|
||||
if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) {
|
||||
double rowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount());
|
||||
@ -245,14 +268,15 @@ public class JoinEstimation {
|
||||
|
||||
private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) {
|
||||
if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) {
|
||||
double sel = computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join);
|
||||
if (join.getJoinType().isLeftSemiOrAntiJoin()) {
|
||||
return new StatisticsBuilder().setRowCount(leftStats.getRowCount())
|
||||
return new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel)
|
||||
.putColumnStatistics(leftStats.columnStatistics())
|
||||
.putColumnStatistics(rightStats.columnStatistics())
|
||||
.build();
|
||||
} else {
|
||||
//right semi or anti
|
||||
return new StatisticsBuilder().setRowCount(rightStats.getRowCount())
|
||||
return new StatisticsBuilder().setRowCount(rightStats.getRowCount() * sel)
|
||||
.putColumnStatistics(leftStats.columnStatistics())
|
||||
.putColumnStatistics(rightStats.columnStatistics())
|
||||
.build();
|
||||
|
||||
@ -668,7 +668,20 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
|
||||
columnStatisticMap.put(slotReference, ColumnStatistic.UNKNOWN);
|
||||
}
|
||||
}
|
||||
return new Statistics(rowCount, columnStatisticMap);
|
||||
Statistics stats = new Statistics(rowCount, columnStatisticMap);
|
||||
stats = normalizeCatalogRelationColumnStatsRowCount(stats);
|
||||
return stats;
|
||||
}
|
||||
|
||||
private Statistics normalizeCatalogRelationColumnStatsRowCount(Statistics stats) {
|
||||
for (Expression slot : stats.columnStatistics().keySet()) {
|
||||
ColumnStatistic colStats = stats.findColumnStatistics(slot);
|
||||
Preconditions.checkArgument(colStats != null,
|
||||
"can not find col stats for %s in table", slot.toSql());
|
||||
stats.addColumnStats(slot,
|
||||
new ColumnStatisticBuilder(colStats).setCount(stats.getRowCount()).build());
|
||||
}
|
||||
return stats;
|
||||
}
|
||||
|
||||
private Statistics computeTopN(TopN topN) {
|
||||
|
||||
@ -228,9 +228,9 @@ public abstract class AbstractPhysicalJoin<
|
||||
@Override
|
||||
public String toString() {
|
||||
List<Object> args = Lists.newArrayList("type", joinType,
|
||||
"stats", statistics,
|
||||
"hashCondition", hashJoinConjuncts,
|
||||
"otherCondition", otherJoinConjuncts,
|
||||
"stats", statistics);
|
||||
"otherCondition", otherJoinConjuncts);
|
||||
if (markJoinSlotReference.isPresent()) {
|
||||
args.add("isMarkJoin");
|
||||
args.add("true");
|
||||
|
||||
@ -121,8 +121,9 @@ public class PhysicalOlapScan extends PhysicalCatalogRelation implements OlapSca
|
||||
@Override
|
||||
public String toString() {
|
||||
return Utils.toSqlString("PhysicalOlapScan[" + id.asInt() + "]" + getGroupIdWithPrefix(),
|
||||
"qualified", Utils.qualifiedName(qualifier, table.getName()),
|
||||
"stats", statistics, "fr", getMutableState(AbstractPlan.FRAGMENT_ID)
|
||||
"table", table.getName(),
|
||||
"stats", statistics,
|
||||
"fr", getMutableState(AbstractPlan.FRAGMENT_ID)
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@ -298,7 +298,8 @@ public class ColumnStatistic {
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return isUnKnown ? "unknown" : String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f, avgSizeByte=%f",
|
||||
return isUnKnown ? "unknown(" + count + ")"
|
||||
: String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f, avgSizeByte=%f",
|
||||
ndv, minValue, minExpr, maxValue, maxExpr, count, avgSizeByte);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user