From 490ad1472029ac4853286a1f22fd7ccafce34b72 Mon Sep 17 00:00:00 2001 From: minghong Date: Wed, 26 Jul 2023 19:10:04 +0800 Subject: [PATCH] [stats](nereids)in predicate range adjust (#22156) 1. refactor in-predicate filter estimation example: A in (1, 2, 3, 4) after in-preidcate filter, A.stats.max<=4 and A.stats.min>=1 2. maintain minExpr and maxExpr in in-predicate stats derive --- .../doris/nereids/stats/FilterEstimation.java | 76 ++++++++++++++----- .../nereids/stats/FilterEstimationTest.java | 5 +- 2 files changed, 60 insertions(+), 21 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index 6cb397437d..44e9a97a8b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.LiteralExpr; import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext; import org.apache.doris.nereids.trees.TreeNode; import org.apache.doris.nereids.trees.expressions.And; @@ -36,6 +37,7 @@ import org.apache.doris.nereids.trees.expressions.Or; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; +import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.statistics.Bucket; import org.apache.doris.statistics.ColumnStatistic; @@ -266,8 +268,12 @@ public class FilterEstimation extends ExpressionVisitor options = inPredicate.getOptions(); - double maxOption = 0; - double minOption = Double.MAX_VALUE; + // init minOption and maxOption by compareExpr.max and compareExpr.min respectively, + // and then adjust min/max by options + double minOptionValue = compareExprStats.maxValue; + double maxOptionValue = compareExprStats.minValue; + LiteralExpr minOptionLiteral = compareExprStats.maxExpr; + LiteralExpr maxOptionLiteral = compareExprStats.minExpr; /* suppose A.(min, max) = (0, 10), A.ndv=10 A in ( 1, 2, 5, 100): validInOptCount = 3, that is (1, 2, 5) @@ -283,32 +289,62 @@ public class FilterEstimation extends ExpressionVisitor 0.0) { - validInOptCount += validOptionNdv; - maxOption = Math.max(optionStats.maxValue, maxOption); - minOption = Math.min(optionStats.minValue, minOption); + if (option instanceof Literal) { + // remove the options which is out of compareExpr.range + if (compareExprStats.minValue <= optionStats.maxValue + && optionStats.maxValue <= compareExprStats.maxValue) { + validInOptCount++; + LiteralExpr optionLiteralExpr = ((Literal) option).toLegacyLiteral(); + if (maxOptionLiteral == null || optionLiteralExpr.compareTo(maxOptionLiteral) >= 0) { + maxOptionLiteral = optionLiteralExpr; + maxOptionValue = optionStats.maxValue; + } + + if (minOptionLiteral == null || optionLiteralExpr.compareTo(minOptionLiteral) <= 0) { + minOptionLiteral = optionLiteralExpr; + minOptionValue = optionStats.minValue; + } + } + } else { + nonLiteralOptionCount++; + } + } + if (nonLiteralOptionCount > 0) { + // A in (x+1, ...) + // "x+1" is not literal, and if const-fold can not handle it, it blocks estimation of min/max value. + // and hence, we do not adjust compareExpr.stats.range. + int newNdv = nonLiteralOptionCount + validInOptCount; + if (newNdv < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(newNdv); + selectivity = StatsMathUtil.divide(newNdv, compareExprStats.ndv); + } else { + selectivity = 1.0; + } + } else { + maxOptionValue = Math.min(maxOptionValue, compareExprStats.maxValue); + minOptionValue = Math.max(minOptionValue, compareExprStats.minValue); + compareExprStatsBuilder.setMaxValue(maxOptionValue); + compareExprStatsBuilder.setMaxExpr(maxOptionLiteral); + compareExprStatsBuilder.setMinValue(minOptionValue); + compareExprStatsBuilder.setMinExpr(minOptionLiteral); + if (validInOptCount < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(validInOptCount); + selectivity = StatsMathUtil.divide(validInOptCount, compareExprStats.ndv); + } else { + selectivity = 1.0; } } - maxOption = Math.min(maxOption, compareExprStats.maxValue); - minOption = Math.max(minOption, compareExprStats.minValue); - compareExprStatsBuilder.setMaxValue(maxOption); - compareExprStatsBuilder.setMinValue(minOption); - - selectivity = StatsMathUtil.minNonNaN(1.0, validInOptCount / compareExprStats.ndv); - compareExprStatsBuilder.setNdv(validInOptCount); Statistics estimated = new Statistics(context.statistics); estimated = estimated.withSel(selectivity); - if (compareExpr instanceof SlotReference) { - estimated.addColumnStats(compareExpr, - compareExprStatsBuilder.build()); - } + estimated.addColumnStats(compareExpr, + compareExprStatsBuilder.build()); + return estimated; } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 1fe5e5b0a0..394dacdee8 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.IntLiteral; import org.apache.doris.nereids.trees.expressions.And; import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.EqualTo; @@ -418,7 +419,9 @@ class FilterEstimationTest { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(1) - .setMaxValue(10); + .setMinExpr(new IntLiteral(1)) + .setMaxValue(10) + .setMaxExpr(new IntLiteral(10)); slotToColumnStat.put(a, builder.build()); Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation();