From 8f264a7206aba5ec78ae4afba937c195b1847b7d Mon Sep 17 00:00:00 2001 From: minghong Date: Thu, 30 May 2024 14:32:14 +0800 Subject: [PATCH] [opt](nereids) compare str literal as date literal to compute selectivity (#35610) This PR improves #34542 for the case where the real data type is date-like. Some users define date (datetime) columns as Varchar type. When estimating the selectivity of a predicate like A>'2020-01-01', if Nereids treats A and '2020-01-01' as date type, the estimated selectivity is more accurate than when they are compared as string type. --- .../apache/doris/analysis/DateLiteral.java | 4 + .../doris/nereids/stats/FilterEstimation.java | 127 ++++++++++++++++-- .../literal/StringLikeLiteral.java | 11 +- .../nereids/stats/FilterEstimationTest.java | 31 ++++- 4 files changed, 159 insertions(+), 14 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java index a8148237fb..1ff103097e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/DateLiteral.java @@ -786,6 +786,10 @@ public class DateLiteral extends LiteralExpr { return getLongValue(); } + public double getDoubleValueAsDateTime() { + return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second; + } + @Override protected void toThrift(TExprNode msg) { if (type.isDatetimeV2()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index 2286daaa44..17b1eb3938 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -17,7 +17,9 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.DateLiteral; import org.apache.doris.analysis.LiteralExpr; +import 
org.apache.doris.analysis.StringLiteral; import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext; import org.apache.doris.nereids.trees.TreeNode; import org.apache.doris.nereids.trees.expressions.And; @@ -39,7 +41,10 @@ import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.nereids.types.DateTimeType; import org.apache.doris.nereids.types.coercion.RangeScalable; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; @@ -50,7 +55,10 @@ import org.apache.doris.statistics.StatisticsBuilder; import com.google.common.base.Preconditions; import com.google.common.collect.Sets; +import java.util.HashMap; import java.util.List; +import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.function.Predicate; @@ -183,22 +191,22 @@ public class FilterEstimation extends ExpressionVisitor literalMap = new HashMap<>(); + DataType compareType = cp.left().getDataType(); + Optional statsForLeftMayConvertedOpt = + tryConvertStringColStatsToDateColStats(statsForLeft, literalMap); + Optional statsForRightMayConvertedOpt = (statsForLeftMayConvertedOpt.isPresent()) + ? 
tryConvertStringColStatsToDateColStats(statsForRight, literalMap) + : Optional.empty(); + + boolean converted = false; + ColumnStatistic statsForLeftMayConverted = statsForLeft; + ColumnStatistic statsForRightMayConverted = statsForRight; + if (statsForLeftMayConvertedOpt.isPresent() && statsForRightMayConvertedOpt.isPresent() + && statsForRightMayConvertedOpt.get().minExpr.getType() + == statsForLeftMayConvertedOpt.get().minExpr.getType()) { + // string type is converted to date type + converted = true; + compareType = DateTimeType.INSTANCE; + statsForLeftMayConverted = statsForLeftMayConvertedOpt.get(); + statsForRightMayConverted = statsForRightMayConvertedOpt.get(); + } + Statistics result = null; if (cp instanceof LessThan || cp instanceof LessThanEqual) { - return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, context); + result = updateLessThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + statsForRightMayConverted, context); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context); + result = updateGreaterThanLiteral(cp.left(), compareType, statsForLeftMayConverted, + statsForRightMayConverted, context); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); } + if (converted) { + // convert min/max of left.colStats back to string type + ColumnStatistic newLeftStats = result.findColumnStatistics(cp.left()); + result.addColumnStats(cp.left(), convertDateColStatsToStringColStats(newLeftStats, literalMap)); + } + return result; } } + private ColumnStatistic convertDateColStatsToStringColStats(ColumnStatistic colStats, + Map literalMap) { + if (colStats.minExpr == null && colStats.maxExpr == null) { + // when sel=0, minExpr and maxExpr are both null + return colStats; + } + Preconditions.checkArgument(colStats.minExpr instanceof DateLiteral + && colStats.maxExpr instanceof DateLiteral, + "cannot 
convert colStats back to stringType %s", colStats.toString()); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats); + StringLiteral newMinLiteral = new StringLiteral(colStats.maxExpr.toString()); + return builder.setMaxExpr(newMinLiteral) + .setMaxExpr(literalMap.get(colStats.maxExpr)) + .setMaxValue(StringLikeLiteral.getDouble(colStats.maxExpr.toString())) + .setMinExpr(literalMap.get(colStats.minExpr)) + .setMinValue(StringLikeLiteral.getDouble(colStats.minExpr.getStringValue())) + .build(); + } + + private Optional tryConvertStringColStatsToDateColStats(ColumnStatistic colStats, + Map literalMap) { + if (colStats.minExpr == null || colStats.maxExpr == null) { + return Optional.empty(); + } + if (!(colStats.minExpr instanceof StringLiteral) || !(colStats.maxExpr instanceof StringLiteral)) { + return Optional.empty(); + } + Optional newMinExpr = tryConvertStrLiteralToDateLiteral(colStats.minExpr); + if (newMinExpr.isEmpty()) { + return Optional.empty(); + } + Optional newMaxExpr = tryConvertStrLiteralToDateLiteral(colStats.maxExpr); + if (newMaxExpr.isEmpty()) { + return Optional.empty(); + } + if (newMaxExpr.get().getType() != newMinExpr.get().getType()) { + return Optional.empty(); + } + literalMap.put(newMinExpr.get(), (StringLiteral) colStats.minExpr); + literalMap.put(newMaxExpr.get(), (StringLiteral) colStats.maxExpr); + + ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats); + return Optional.of(builder.setMinValue(newMinExpr.get().getDoubleValueAsDateTime()) + .setMinExpr(newMinExpr.get()) + .setMaxValue(newMaxExpr.get().getDoubleValueAsDateTime()) + .setMaxExpr(newMaxExpr.get()) + .build()); + } + + private Optional tryConvertStrLiteralToDateLiteral(LiteralExpr literal) { + if (literal == null) { + return Optional.empty(); + } + if (!(literal instanceof StringLiteral)) { + return Optional.empty(); + } + + DateLiteral dt = null; + try { + dt = new DateLiteral(literal.getStringValue()); + dt.checkValueValid(); + } catch 
(Exception e) { + // ignore + } + return dt == null ? Optional.empty() : Optional.of(dt); + } + private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic statsForLeft, ColumnStatistic statsForRight, EstimationContext context) { @@ -467,11 +572,11 @@ public class FilterEstimation extends ExpressionVisitor