[opt](nereids) if column stats are unknown, optimize 10-20 table joins with the cascades framework instead of DPHyp (#29902)

* If column stats are unknown, do not use DPHyp.
TPC-DS query64 is optimized for the case where no stats are available.
On sf500, query64 improved from 15 sec to 7 sec on HDFS, and from 4 sec to 3.85 sec on OLAP tables.
This commit is contained in:
minghong
2024-01-16 15:22:36 +08:00
committed by yiguolei
parent e1bcdc35fd
commit 22978726e3
5 changed files with 195 additions and 171 deletions

View File

@ -74,6 +74,13 @@ public class StatementContext {
private boolean isDpHyp = false;
private boolean isOtherJoinReorder = false;
// hasUnknownColStats is true if the stats of any column in the tables used by this sql are unknown.
// The algorithm that derives a plan when column stats are unknown is implemented in the cascades framework,
// not in DPHyp. Hence, when column stats are unknown, join reorder should choose the cascades framework even
// if this sql uses more tables than MAX_TABLE_COUNT_USE_CASCADES_JOIN_REORDER.
// Thus hasUnknownColStats has higher priority than isDpHyp.
private boolean hasUnknownColStats = false;
private final IdGenerator<ExprId> exprIdGenerator = ExprId.createGenerator();
private final IdGenerator<ObjectId> objectIdGenerator = ObjectId.createGenerator();
private final IdGenerator<RelationId> relationIdGenerator = RelationId.createGenerator();
@ -261,4 +268,12 @@ public class StatementContext {
/**
 * Appends every expression in the given collection to the join filters held by this context.
 *
 * @param newJoinFilters the expressions to add
 */
public void addJoinFilters(Collection<Expression> newJoinFilters) {
    joinFilters.addAll(newJoinFilters);
}
/**
 * Reports whether unknown column stats have been flagged on this statement context.
 *
 * @return the current value of the {@code hasUnknownColStats} flag
 */
public boolean isHasUnknownColStats() {
    return this.hasUnknownColStats;
}
/**
 * Overwrites the {@code hasUnknownColStats} flag of this statement context.
 *
 * @param hasUnknownColStats the new value of the flag
 */
public void setHasUnknownColStats(boolean hasUnknownColStats) {
this.hasUnknownColStats = hasUnknownColStats;
}
}

View File

@ -22,6 +22,7 @@ import org.apache.doris.nereids.jobs.cascades.DeriveStatsJob;
import org.apache.doris.nereids.jobs.cascades.OptimizeGroupJob;
import org.apache.doris.nereids.jobs.joinorder.JoinOrderJob;
import org.apache.doris.nereids.memo.Group;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.qe.SessionVariable;
import java.util.Objects;
@ -49,11 +50,22 @@ public class Optimizer {
cascadesContext.pushJob(new DeriveStatsJob(cascadesContext.getMemo().getRoot().getLogicalExpression(),
cascadesContext.getCurrentJobContext()));
cascadesContext.getJobScheduler().executeJobPool(cascadesContext);
boolean optimizeWithUnknownColStats = false;
if (ConnectContext.get() != null && ConnectContext.get().getStatementContext() != null) {
if (ConnectContext.get().getStatementContext().isHasUnknownColStats()) {
optimizeWithUnknownColStats = true;
}
}
// DPHyp optimize
int maxTableCount = getSessionVariable().getMaxTableCountUseCascadesJoinReorder();
if (optimizeWithUnknownColStats) {
// if column stats are unknown, 10~20 table-join is optimized by cascading framework
maxTableCount = 2 * maxTableCount;
}
int maxJoinCount = cascadesContext.getMemo().countMaxContinuousJoin();
cascadesContext.getStatementContext().setMaxContinuousJoin(maxJoinCount);
boolean isDpHyp = getSessionVariable().enableDPHypOptimizer
|| maxJoinCount > getSessionVariable().getMaxTableCountUseCascadesJoinReorder();
|| maxJoinCount > maxTableCount;
cascadesContext.getStatementContext().setDpHyp(isDpHyp);
cascadesContext.getStatementContext().setOtherJoinReorder(false);
if (!getSessionVariable().isDisableJoinReorder() && isDpHyp

View File

@ -623,6 +623,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>();
TableIf table = catalogRelation.getTable();
double rowCount = catalogRelation.getTable().estimatedRowCount();
boolean hasUnknownCol = false;
for (SlotReference slotReference : slotSet) {
String colName = slotReference.getName();
boolean shouldIgnoreThisCol = StatisticConstants.shouldIgnoreCol(table, slotReference.getColumn().get());
@ -644,13 +645,19 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
}
if (!cache.isUnKnown) {
rowCount = Math.max(rowCount, cache.count);
} else {
hasUnknownCol = true;
}
if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().enableStats) {
columnStatisticMap.put(slotReference, cache);
} else {
columnStatisticMap.put(slotReference, ColumnStatistic.UNKNOWN);
hasUnknownCol = true;
}
}
if (hasUnknownCol && ConnectContext.get() != null && ConnectContext.get().getStatementContext() != null) {
ConnectContext.get().getStatementContext().setHasUnknownColStats(true);
}
Statistics stats = new Statistics(rowCount, columnStatisticMap);
stats = normalizeCatalogRelationColumnStatsRowCount(stats);
return stats;