[opt](nereids) use 2 phase agg above union all (#26245)

forbid one phase agg for pattern: agg-unionAll
one phase agg plan: agg-union-hashDistribute-children
two phase agg plan: agg(global) - hashDistribute-agg(local)-union-randomDistribute
the key point is the cost of randomDistribute is much lower than the hashDistribute, and hence two-phase agg wins.
This commit is contained in:
minghong
2023-11-08 17:15:53 +08:00
committed by GitHub
parent 96d2e3394a
commit a6d2013802
11 changed files with 314 additions and 234 deletions

View File

@ -56,6 +56,7 @@ class CostModelV1 extends PlanVisitor<Cost, PlanContext> {
// the penalty factor is no more than BROADCAST_JOIN_SKEW_PENALTY_LIMIT
static final double BROADCAST_JOIN_SKEW_RATIO = 30.0;
static final double BROADCAST_JOIN_SKEW_PENALTY_LIMIT = 2.0;
static final double RANDOM_SHUFFLE_TO_HASH_SHUFFLE_FACTOR = 0.1;
private final int beNumber;
public CostModelV1(ConnectContext connectContext) {
@ -217,10 +218,11 @@ class CostModelV1 extends PlanVisitor<Cost, PlanContext> {
}
// any
// cost of randome shuffle is lower than hash shuffle.
return CostV1.of(context.getSessionVariable(),
intputRowCount,
0,
0);
0,
intputRowCount * childStatistics.dataSizeFactor() * RANDOM_SHUFFLE_TO_HASH_SHUFFLE_FACTOR / beNumber);
}
@Override

View File

@ -42,6 +42,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalNestedLoopJoin;
import org.apache.doris.nereids.trees.plans.physical.PhysicalPartitionTopN;
import org.apache.doris.nereids.trees.plans.physical.PhysicalProject;
import org.apache.doris.nereids.trees.plans.physical.PhysicalSetOperation;
import org.apache.doris.nereids.trees.plans.physical.PhysicalUnion;
import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor;
import org.apache.doris.nereids.util.JoinUtils;
import org.apache.doris.qe.ConnectContext;
@ -117,6 +118,15 @@ public class ChildrenPropertiesRegulator extends PlanVisitor<Boolean, Void> {
&& children.get(0).getPlan() instanceof PhysicalDistribute) {
return false;
}
// agg(group by x)-union all(A, B)
// no matter x.ndv is high or not, it is not worthwhile to shuffle A and B by x
// and hence we forbid one phase agg
if (agg.getAggMode() == AggMode.INPUT_TO_RESULT
&& children.get(0).getPlan() instanceof PhysicalUnion
&& !((PhysicalUnion) children.get(0).getPlan()).isDistinct()) {
return false;
}
// forbid multi distinct opt that bad than multi-stage version when multi-stage can be executed in one fragment
if (agg.getAggMode() == AggMode.INPUT_TO_BUFFER || agg.getAggMode() == AggMode.INPUT_TO_RESULT) {
List<MultiDistinction> multiDistinctions = agg.getOutputExpressions().stream()

View File

@ -194,4 +194,8 @@ public abstract class PhysicalSetOperation extends AbstractPhysicalPlan implemen
.map(NamedExpression::toSlot)
.collect(ImmutableList.toImmutableList());
}
public boolean isDistinct() {
return qualifier == Qualifier.DISTINCT;
}
}

View File

@ -40,7 +40,7 @@ import java.util.Set;
public class ColumnStatistic {
public static final double STATS_ERROR = 0.1D;
public static final double ALMOST_UNIQUE_FACTOR = 0.9;
public static final StatsType NDV = StatsType.NDV;
public static final StatsType AVG_SIZE = StatsType.AVG_SIZE;
public static final StatsType MAX_SIZE = StatsType.MAX_SIZE;
@ -211,7 +211,7 @@ public class ColumnStatistic {
}
public static boolean isAlmostUnique(double ndv, double rowCount) {
return rowCount * 0.9 < ndv && ndv < rowCount * 1.1;
return rowCount * ALMOST_UNIQUE_FACTOR < ndv;
}
public ColumnStatistic updateByLimit(long limit, double rowCount) {