[enhancement](Nereids): remove stats derivation in CostAndEnforce job (#24945)

1. Remove statistics derivation from CostAndEnforcerJob.
2. Enforce the validity of each statistics object after estimation.
This commit is contained in:
谢健
2023-09-27 16:31:03 +08:00
committed by GitHub
parent 5fc04b6aeb
commit 9562e280af
9 changed files with 102 additions and 99 deletions

View File

@ -88,7 +88,7 @@ public class ApplyRuleJob extends Job {
} else {
// The Join Commute rule preserves the operator's expression and children,
// thereby not altering the statistics. Hence, there is no need to derive statistics for it.
groupExpression.setStatDerived(true);
newGroupExpression.setStatDerived(true);
}
} else {
pushJob(new CostAndEnforcerJob(newGroupExpression, context));
@ -100,7 +100,7 @@ public class ApplyRuleJob extends Job {
// These implementation rules integrate rules for plan shape transformation.
pushJob(new DeriveStatsJob(newGroupExpression, context));
} else {
groupExpression.setStatDerived(true);
newGroupExpression.setStatDerived(true);
}
}

View File

@ -31,7 +31,6 @@ import org.apache.doris.nereids.properties.ChildrenPropertiesRegulator;
import org.apache.doris.nereids.properties.EnforceMissingPropertiesHelper;
import org.apache.doris.nereids.properties.PhysicalProperties;
import org.apache.doris.nereids.properties.RequestPropertyDeriver;
import org.apache.doris.nereids.stats.StatsCalculator;
import com.google.common.collect.Lists;
import org.apache.logging.log4j.LogManager;
@ -236,25 +235,13 @@ public class CostAndEnforcerJob extends Job implements Cloneable {
PhysicalProperties outputProperty = childOutputPropertyDeriver.getOutputProperties(groupExpression);
// update current group statistics and re-compute costs.
if (groupExpression.children().stream().anyMatch(group -> group.getStatistics() == null)) {
// TODO: If it's error, add some warning log at least.
if (groupExpression.children().stream().anyMatch(group -> group.getStatistics() == null)
&& groupExpression.getOwnerGroup().getStatistics() == null) {
// If we get here, there is an error in the stats calculator that should be fixed.
LOG.warn("Nereids try to calculate cost without stats for group expression {}", groupExpression);
return false;
}
StatsCalculator statsCalculator = StatsCalculator.estimate(groupExpression,
context.getCascadesContext().getConnectContext().getSessionVariable().getForbidUnknownColStats(),
context.getCascadesContext().getConnectContext().getTotalColumnStatisticMap(),
context.getCascadesContext().getConnectContext().getSessionVariable().isPlayNereidsDump(),
context.getCascadesContext());
if (!context.getCascadesContext().getConnectContext().getSessionVariable().isPlayNereidsDump()
&& context.getCascadesContext().getConnectContext().getSessionVariable().isEnableMinidump()) {
context.getCascadesContext().getConnectContext().getTotalColumnStatisticMap()
.putAll(statsCalculator.getTotalColumnStatisticMap());
context.getCascadesContext().getConnectContext().getTotalHistogramMap()
.putAll(statsCalculator.getTotalHistogramMap());
}
// recompute cost after adjusting property
curNodeCost = CostCalculator.calculateCost(groupExpression, requestChildrenProperties);
groupExpression.setCost(curNodeCost);

View File

@ -220,6 +220,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
private void estimate() {
Plan plan = groupExpression.getPlan();
Statistics newStats = plan.accept(this, null);
newStats.enforceValid();
// We ensure that the rowCount remains unchanged in order to make the cost of each plan comparable.
if (groupExpression.getOwnerGroup().getStatistics() == null) {
groupExpression.getOwnerGroup().setStatistics(newStats);

View File

@ -123,6 +123,26 @@ public class Statistics {
}
}
/**
 * Clamps, in place, every column statistic that is inconsistent with {@code rowCount}.
 *
 * <p>For each entry rejected by {@link #checkColumnStatsValid(ColumnStatistic)}:
 * <ul>
 *   <li>{@code ndv} is capped at {@code rowCount};</li>
 *   <li>{@code numNulls} is capped at {@code rowCount - ndv} (using the capped ndv);</li>
 *   <li>{@code count} is set to {@code rowCount}.</li>
 * </ul>
 * Entries that already pass the check are left untouched.
 */
public void enforceValid() {
    for (Entry<Expression, ColumnStatistic> entry : expressionToColumnStats.entrySet()) {
        ColumnStatistic columnStatistic = entry.getValue();
        if (checkColumnStatsValid(columnStatistic)) {
            // Already consistent with rowCount: nothing to rewrite for this entry.
            continue;
        }
        double ndv = Math.min(columnStatistic.ndv, rowCount);
        ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic);
        columnStatisticBuilder.setNdv(ndv);
        columnStatisticBuilder.setNumNulls(Math.min(columnStatistic.numNulls, rowCount - ndv));
        columnStatisticBuilder.setCount(rowCount);
        // Replace the value through the entry instead of calling put() on the map
        // whose entry set is currently being iterated.
        entry.setValue(columnStatisticBuilder.build());
    }
}
/**
 * Checks whether a column statistic is consistent with this object's {@code rowCount}.
 *
 * @param columnStatistic the column statistic to check
 * @return {@code true} only when {@code ndv <= rowCount} and
 *         {@code numNulls <= rowCount - ndv}; {@code false} otherwise
 */
public boolean checkColumnStatsValid(ColumnStatistic columnStatistic) {
    // First condition: the distinct-value count must not exceed the row count.
    if (!(columnStatistic.ndv <= rowCount)) {
        return false;
    }
    // Second condition: nulls must fit in the rows left over after the distinct values.
    return columnStatistic.numNulls <= rowCount - columnStatistic.ndv;
}
public Statistics withSel(double sel) {
sel = StatsMathUtil.minNonNaN(sel, 1);
return withRowCount(rowCount * sel);