[stats](nereids)in predicate range adjust (#22156)

1. refactor in-predicate filter estimation
example: A in (1, 2, 3, 4)
after in-preidcate filter, A.stats.max<=4 and A.stats.min>=1
2. maintain minExpr and maxExpr in in-predicate stats derive
This commit is contained in:
minghong
2023-07-26 19:10:04 +08:00
committed by GitHub
parent 21ea0055fc
commit 490ad14720
2 changed files with 60 additions and 21 deletions

View File

@ -17,6 +17,7 @@
package org.apache.doris.nereids.stats;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext;
import org.apache.doris.nereids.trees.TreeNode;
import org.apache.doris.nereids.trees.expressions.And;
@ -36,6 +37,7 @@ import org.apache.doris.nereids.trees.expressions.Or;
import org.apache.doris.nereids.trees.expressions.Slot;
import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.functions.Function;
import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.statistics.Bucket;
import org.apache.doris.statistics.ColumnStatistic;
@ -266,8 +268,12 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
return context.statistics.withSel(DEFAULT_IN_COEFFICIENT);
}
List<Expression> options = inPredicate.getOptions();
double maxOption = 0;
double minOption = Double.MAX_VALUE;
// init minOption and maxOption by compareExpr.max and compareExpr.min respectively,
// and then adjust min/max by options
double minOptionValue = compareExprStats.maxValue;
double maxOptionValue = compareExprStats.minValue;
LiteralExpr minOptionLiteral = compareExprStats.maxExpr;
LiteralExpr maxOptionLiteral = compareExprStats.minExpr;
/* suppose A.(min, max) = (0, 10), A.ndv=10
A in ( 1, 2, 5, 100):
validInOptCount = 3, that is (1, 2, 5)
@ -283,32 +289,62 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
A.(min, max) not changed
A.selectivity = 7/10
*/
double validInOptCount = 0;
int validInOptCount = 0;
double selectivity = 1.0;
ColumnStatisticBuilder compareExprStatsBuilder = new ColumnStatisticBuilder(compareExprStats);
int nonLiteralOptionCount = 0;
for (Expression option : options) {
ColumnStatistic optionStats = ExpressionEstimation.estimate(option, context.statistics);
double validOptionNdv = compareExprStats.ndvIntersection(optionStats);
if (validOptionNdv > 0.0) {
validInOptCount += validOptionNdv;
maxOption = Math.max(optionStats.maxValue, maxOption);
minOption = Math.min(optionStats.minValue, minOption);
if (option instanceof Literal) {
// remove the options which is out of compareExpr.range
if (compareExprStats.minValue <= optionStats.maxValue
&& optionStats.maxValue <= compareExprStats.maxValue) {
validInOptCount++;
LiteralExpr optionLiteralExpr = ((Literal) option).toLegacyLiteral();
if (maxOptionLiteral == null || optionLiteralExpr.compareTo(maxOptionLiteral) >= 0) {
maxOptionLiteral = optionLiteralExpr;
maxOptionValue = optionStats.maxValue;
}
if (minOptionLiteral == null || optionLiteralExpr.compareTo(minOptionLiteral) <= 0) {
minOptionLiteral = optionLiteralExpr;
minOptionValue = optionStats.minValue;
}
}
} else {
nonLiteralOptionCount++;
}
}
if (nonLiteralOptionCount > 0) {
// A in (x+1, ...)
// "x+1" is not literal, and if const-fold can not handle it, it blocks estimation of min/max value.
// and hence, we do not adjust compareExpr.stats.range.
int newNdv = nonLiteralOptionCount + validInOptCount;
if (newNdv < compareExprStats.ndv) {
compareExprStatsBuilder.setNdv(newNdv);
selectivity = StatsMathUtil.divide(newNdv, compareExprStats.ndv);
} else {
selectivity = 1.0;
}
} else {
maxOptionValue = Math.min(maxOptionValue, compareExprStats.maxValue);
minOptionValue = Math.max(minOptionValue, compareExprStats.minValue);
compareExprStatsBuilder.setMaxValue(maxOptionValue);
compareExprStatsBuilder.setMaxExpr(maxOptionLiteral);
compareExprStatsBuilder.setMinValue(minOptionValue);
compareExprStatsBuilder.setMinExpr(minOptionLiteral);
if (validInOptCount < compareExprStats.ndv) {
compareExprStatsBuilder.setNdv(validInOptCount);
selectivity = StatsMathUtil.divide(validInOptCount, compareExprStats.ndv);
} else {
selectivity = 1.0;
}
}
maxOption = Math.min(maxOption, compareExprStats.maxValue);
minOption = Math.max(minOption, compareExprStats.minValue);
compareExprStatsBuilder.setMaxValue(maxOption);
compareExprStatsBuilder.setMinValue(minOption);
selectivity = StatsMathUtil.minNonNaN(1.0, validInOptCount / compareExprStats.ndv);
compareExprStatsBuilder.setNdv(validInOptCount);
Statistics estimated = new Statistics(context.statistics);
estimated = estimated.withSel(selectivity);
if (compareExpr instanceof SlotReference) {
estimated.addColumnStats(compareExpr,
compareExprStatsBuilder.build());
}
estimated.addColumnStats(compareExpr,
compareExprStatsBuilder.build());
return estimated;
}

View File

@ -17,6 +17,7 @@
package org.apache.doris.nereids.stats;
import org.apache.doris.analysis.IntLiteral;
import org.apache.doris.nereids.trees.expressions.And;
import org.apache.doris.nereids.trees.expressions.Cast;
import org.apache.doris.nereids.trees.expressions.EqualTo;
@ -418,7 +419,9 @@ class FilterEstimationTest {
.setAvgSizeByte(4)
.setNumNulls(0)
.setMinValue(1)
.setMaxValue(10);
.setMinExpr(new IntLiteral(1))
.setMaxValue(10)
.setMaxExpr(new IntLiteral(10));
slotToColumnStat.put(a, builder.build());
Statistics stat = new Statistics(1000, slotToColumnStat);
FilterEstimation filterEstimation = new FilterEstimation();