[opt](nereids) compare str literal as date literal to compute selectivity (#35610)

This PR improves on #34542 for the case where the real data type is date-like.

Some users define date (or datetime) columns as Varchar.
When estimating the selectivity of a predicate such as A > '2020-01-01',
the estimate is more accurate if nereids treats both A and '2020-01-01'
as date values rather than as strings.
This commit is contained in:
minghong
2024-05-30 14:32:14 +08:00
committed by yiguolei
parent 3efab570df
commit 8f264a7206
4 changed files with 159 additions and 14 deletions

View File

@ -786,6 +786,10 @@ public class DateLiteral extends LiteralExpr {
return getLongValue();
}
/**
 * Packs the date and time fields of this literal into one number and returns it as a double.
 *
 * <p>The date fields are combined as {@code year * 10000 + month * 100 + day}, shifted left by
 * six decimal digits, and the time fields are added as
 * {@code hour * 10000 + minute * 100 + second}.
 *
 * @return the packed date-time value, widened to {@code double}
 */
public double getDoubleValueAsDateTime() {
    final long datePart = year * 10000 + month * 100 + day;
    final long timePart = hour * 10000 + minute * 100 + second;
    return datePart * 1000000L + timePart;
}
@Override
protected void toThrift(TExprNode msg) {
if (type.isDatetimeV2()) {

View File

@ -17,7 +17,9 @@
package org.apache.doris.nereids.stats;
import org.apache.doris.analysis.DateLiteral;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.analysis.StringLiteral;
import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext;
import org.apache.doris.nereids.trees.TreeNode;
import org.apache.doris.nereids.trees.expressions.And;
@ -39,7 +41,10 @@ import org.apache.doris.nereids.trees.expressions.Slot;
import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.functions.Function;
import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.literal.StringLikeLiteral;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.DataType;
import org.apache.doris.nereids.types.DateTimeType;
import org.apache.doris.nereids.types.coercion.RangeScalable;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
@ -50,7 +55,10 @@ import org.apache.doris.statistics.StatisticsBuilder;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
@ -183,22 +191,22 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
}
}
// Estimates a "left < literal" / "left <= literal" filter. The literal side is modelled as the
// range from the left column's min (statsForLeft.minValue/minExpr) to the literal's max
// (statsForRight.maxValue/maxExpr), and that range is handed to estimateBinaryComparisonFilter.
// NOTE(review): this span is rendered from a diff without +/- markers, so removed and added
// lines appear together. The variants using leftExpr.getDataType() appear to be the removed
// lines and the variants taking an explicit dataType the added ones — confirm against the repo.
private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft,
private Statistics updateLessThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft,
ColumnStatistic statsForRight, EstimationContext context) {
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr,
statsForRight.maxValue, statsForRight.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
return estimateBinaryComparisonFilter(leftExpr,
statsForLeft.ndv, dataType);
return estimateBinaryComparisonFilter(leftExpr, dataType,
statsForLeft,
rightRange, context);
}
// Estimates a "left > literal" / "left >= literal" filter. The literal side is modelled as the
// range from the literal's min (statsForRight.minValue/minExpr) to the left column's max
// (statsForLeft.maxValue/maxExpr), and that range is handed to estimateBinaryComparisonFilter.
// NOTE(review): this span is rendered from a diff without +/- markers, so removed and added
// lines appear together. The variants using leftExpr.getDataType() appear to be the removed
// lines and the variants taking an explicit dataType the added ones — confirm against the repo.
private Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft,
private Statistics updateGreaterThanLiteral(Expression leftExpr, DataType dataType, ColumnStatistic statsForLeft,
ColumnStatistic statsForRight, EstimationContext context) {
StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr,
statsForLeft.maxValue, statsForLeft.maxExpr,
statsForLeft.ndv, leftExpr.getDataType());
return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context);
statsForLeft.ndv, dataType);
return estimateBinaryComparisonFilter(leftExpr, dataType, statsForLeft, rightRange, context);
}
private Statistics calculateWhenLiteralRight(ComparisonPredicate cp,
@ -210,16 +218,113 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
if (cp instanceof EqualPredicate) {
return estimateEqualTo(cp, statsForLeft, statsForRight, context);
} else {
// literal map used to convert DateLiteral back to StringLiteral
Map<DateLiteral, StringLiteral> literalMap = new HashMap<>();
DataType compareType = cp.left().getDataType();
Optional<ColumnStatistic> statsForLeftMayConvertedOpt =
tryConvertStringColStatsToDateColStats(statsForLeft, literalMap);
Optional<ColumnStatistic> statsForRightMayConvertedOpt = (statsForLeftMayConvertedOpt.isPresent())
? tryConvertStringColStatsToDateColStats(statsForRight, literalMap)
: Optional.empty();
boolean converted = false;
ColumnStatistic statsForLeftMayConverted = statsForLeft;
ColumnStatistic statsForRightMayConverted = statsForRight;
if (statsForLeftMayConvertedOpt.isPresent() && statsForRightMayConvertedOpt.isPresent()
&& statsForRightMayConvertedOpt.get().minExpr.getType()
== statsForLeftMayConvertedOpt.get().minExpr.getType()) {
// string type is converted to date type
converted = true;
compareType = DateTimeType.INSTANCE;
statsForLeftMayConverted = statsForLeftMayConvertedOpt.get();
statsForRightMayConverted = statsForRightMayConvertedOpt.get();
}
Statistics result = null;
if (cp instanceof LessThan || cp instanceof LessThanEqual) {
return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, context);
result = updateLessThanLiteral(cp.left(), compareType, statsForLeftMayConverted,
statsForRightMayConverted, context);
} else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) {
return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context);
result = updateGreaterThanLiteral(cp.left(), compareType, statsForLeftMayConverted,
statsForRightMayConverted, context);
} else {
throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql()));
}
if (converted) {
// convert min/max of left.colStats back to string type
ColumnStatistic newLeftStats = result.findColumnStatistics(cp.left());
result.addColumnStats(cp.left(), convertDateColStatsToStringColStats(newLeftStats, literalMap));
}
return result;
}
}
/**
 * Converts column statistics whose min/max are date literals back to string-typed statistics.
 *
 * <p>The min/max expressions are replaced by the string literals recorded in
 * {@code literalMap}, and the min/max numeric values are recomputed with
 * {@link StringLikeLiteral#getDouble(String)} from the date literals' string values.
 *
 * @param colStats   statistics whose minExpr/maxExpr are expected to be {@link DateLiteral}s,
 *                   or both null
 * @param literalMap mapping from each date literal to the string literal it replaces
 * @return {@code colStats} unchanged when both bounds are null, otherwise a rebuilt copy
 * @throws IllegalArgumentException if either bound is not a {@link DateLiteral}
 */
private ColumnStatistic convertDateColStatsToStringColStats(ColumnStatistic colStats,
        Map<DateLiteral, StringLiteral> literalMap) {
    if (colStats.minExpr == null && colStats.maxExpr == null) {
        // when sel=0, minExpr and maxExpr are both null
        return colStats;
    }
    Preconditions.checkArgument(colStats.minExpr instanceof DateLiteral
                    && colStats.maxExpr instanceof DateLiteral,
            "cannot convert colStats back to stringType %s", colStats.toString());
    // Both bounds are encoded from getStringValue() so that min and max are derived from the
    // same textual form. The previous code used toString() for max only, and set a throwaway
    // max expression that was immediately overwritten.
    return new ColumnStatisticBuilder(colStats)
            .setMaxExpr(literalMap.get(colStats.maxExpr))
            .setMaxValue(StringLikeLiteral.getDouble(colStats.maxExpr.getStringValue()))
            .setMinExpr(literalMap.get(colStats.minExpr))
            .setMinValue(StringLikeLiteral.getDouble(colStats.minExpr.getStringValue()))
            .build();
}
/**
 * Tries to reinterpret string-typed column statistics as date statistics.
 *
 * <p>The conversion succeeds only when both minExpr and maxExpr are {@link StringLiteral}s
 * that convert to {@link DateLiteral}s of equal type. On success the two
 * date-literal to string-literal pairs are recorded in {@code literalMap}, so the
 * statistics can be converted back later. On failure {@code literalMap} is left untouched.
 *
 * @param colStats   statistics to convert
 * @param literalMap output map receiving the date literal to original string literal pairs
 * @return the converted statistics, or {@link Optional#empty()} if conversion is not possible
 */
private Optional<ColumnStatistic> tryConvertStringColStatsToDateColStats(ColumnStatistic colStats,
        Map<DateLiteral, StringLiteral> literalMap) {
    // instanceof is false for null, so this also rejects missing bounds
    if (!(colStats.minExpr instanceof StringLiteral) || !(colStats.maxExpr instanceof StringLiteral)) {
        return Optional.empty();
    }
    Optional<DateLiteral> newMinExpr = tryConvertStrLiteralToDateLiteral(colStats.minExpr);
    if (!newMinExpr.isPresent()) {
        return Optional.empty();
    }
    Optional<DateLiteral> newMaxExpr = tryConvertStrLiteralToDateLiteral(colStats.maxExpr);
    if (!newMaxExpr.isPresent()) {
        return Optional.empty();
    }
    DateLiteral minDate = newMinExpr.get();
    DateLiteral maxDate = newMaxExpr.get();
    // compare types by value rather than by reference, so equal types held in distinct
    // instances are not rejected
    if (!maxDate.getType().equals(minDate.getType())) {
        return Optional.empty();
    }
    literalMap.put(minDate, (StringLiteral) colStats.minExpr);
    literalMap.put(maxDate, (StringLiteral) colStats.maxExpr);

    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(colStats);
    return Optional.of(builder.setMinValue(minDate.getDoubleValueAsDateTime())
            .setMinExpr(minDate)
            .setMaxValue(maxDate.getDoubleValueAsDateTime())
            .setMaxExpr(maxDate)
            .build());
}
/**
 * Tries to parse a string literal as a date literal.
 *
 * @param literal the literal to convert; may be null
 * @return the parsed and validated {@link DateLiteral}, or {@link Optional#empty()} when
 *         {@code literal} is not a {@link StringLiteral}, cannot be parsed, or fails
 *         {@code checkValueValid()}
 */
private Optional<DateLiteral> tryConvertStrLiteralToDateLiteral(LiteralExpr literal) {
    // instanceof is false for null, so this also covers a missing literal
    if (!(literal instanceof StringLiteral)) {
        return Optional.empty();
    }
    try {
        DateLiteral dt = new DateLiteral(literal.getStringValue());
        dt.checkValueValid();
        return Optional.of(dt);
    } catch (Exception ignored) {
        // Best-effort conversion: a string that does not parse, or parses to an invalid
        // date, is simply "not a date". Returning empty here also ensures a literal that
        // failed validation is never handed back to the caller.
        return Optional.empty();
    }
}
private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic statsForLeft,
ColumnStatistic statsForRight,
EstimationContext context) {
@ -467,11 +572,11 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
}
}
private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats,
private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType dataType, ColumnStatistic leftStats,
StatisticRange rightRange, EstimationContext context) {
StatisticRange leftRange =
new StatisticRange(leftStats.minValue, leftStats.minExpr, leftStats.maxValue, leftStats.maxExpr,
leftStats.ndv, leftExpr.getDataType());
leftStats.ndv, dataType);
StatisticRange intersectRange = leftRange.cover(rightRange);
ColumnStatisticBuilder leftColumnStatisticBuilder;
@ -495,7 +600,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
.setNdv(intersectRange.getDistinctValues())
.setNumNulls(0);
double sel = leftRange.overlapPercentWith(rightRange);
if (!(leftExpr.getDataType() instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) {
if (!(dataType instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) {
sel = DEFAULT_INEQUALITY_COEFFICIENT;
}
sel = getNotNullSelectivity(leftStats, sel);

View File

@ -39,11 +39,18 @@ public abstract class StringLikeLiteral extends Literal {
/**
 * Returns the numeric encoding of this literal's string value by delegating to
 * {@link #getDouble(String)}.
 */
@Override
public double getDouble() {
    return StringLikeLiteral.getDouble(this.value);
}
/**
* get double value
*/
public static double getDouble(String str) {
long v = 0;
int pos = 0;
int len = Math.min(value.length(), 7);
int len = Math.min(str.length(), 7);
while (pos < len) {
v += Byte.toUnsignedLong(value.getBytes()[pos]) << ((6 - pos) * 8);
v += Byte.toUnsignedLong(str.getBytes()[pos]) << ((6 - pos) * 8);
pos++;
}
return (double) v;