From 3fc8c19735e55bee1e475124d42b155a1dea1d8e Mon Sep 17 00:00:00 2001 From: minghong Date: Tue, 4 Apr 2023 14:20:34 +0800 Subject: [PATCH] [improve](nereids)compute statsRange.length() according to the column datatype (#18331) We map date/datetime (and their V2 variants) to double. This mapping preserves date order, but it does not preserve range length. For example, from 1990-01-01 to 1991-01-01 there are 12 months, so for the filter `A < 1990-02-01` the selectivity should be about `1/12`. If we instead compute the selectivity from the corresponding double values, `sel = (19900201 - 19900101) / (19910101 - 19900101) = 100/10000 = 1/100`, which is off by roughly a factor of 10. This PR aims to fix this error. Describe your changes. Solution: convert each double back to its corresponding data type (date/datev2), then compute the range length with respect to that data type. --- .../doris/nereids/stats/FilterEstimation.java | 19 ++++++----- .../doris/nereids/stats/StatsCalculator.java | 9 ++--- .../expressions/literal/DateLiteral.java | 5 +++ .../expressions/literal/DateTimeLiteral.java | 5 +++ .../apache/doris/nereids/types/DataType.java | 4 +++ .../nereids/types/coercion/DateLikeType.java | 22 +++++++++++++ .../doris/statistics/StatisticRange.java | 33 +++++++++++-------- .../nereids/stats/FilterEstimationTest.java | 24 ++++++++++++++ 8 files changed, 94 insertions(+), 27 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index 7cebe8f81d..0fd9f1e9b1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -176,7 +176,8 @@ public class FilterEstimation extends ExpressionVisitor { double rightRowCount = childStats.get(j).getRowCount(); ColumnStatistic estimatedColumnStatistics = unionColumn(headStats.findColumnStatistics(headSlot), - headStats.getRowCount(), rightStatistic, rightRowCount); + 
headStats.getRowCount(), rightStatistic, rightRowCount, headSlot.getDataType()); headStats.addColumnStats(headSlot, estimatedColumnStatistics); leftRowCount += childStats.get(j).getRowCount(); } @@ -692,12 +693,12 @@ public class StatsCalculator extends DefaultPlanVisitor { } private ColumnStatistic unionColumn(ColumnStatistic leftStats, double leftRowCount, ColumnStatistic rightStats, - double rightRowCount) { + double rightRowCount, DataType dataType) { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); columnStatisticBuilder.setMaxValue(Math.max(leftStats.maxValue, rightStats.maxValue)); columnStatisticBuilder.setMinValue(Math.min(leftStats.minValue, rightStats.minValue)); - StatisticRange leftRange = StatisticRange.from(leftStats); - StatisticRange rightRange = StatisticRange.from(rightStats); + StatisticRange leftRange = StatisticRange.from(leftStats, dataType); + StatisticRange rightRange = StatisticRange.from(rightStats, dataType); StatisticRange newRange = leftRange.union(rightRange); double newRowCount = leftRowCount + rightRowCount; double leftSize = (leftRowCount - leftStats.numNulls) * leftStats.avgSizeByte; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java index f401d73b0c..da3dd4dfc3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateLiteral.java @@ -148,6 +148,11 @@ public class DateLiteral extends Literal { return (year * 10000 + month * 100 + day) * 1000000L; } + @Override + public double getDouble() { + return (double) getValue(); + } + @Override public String getStringValue() { return String.format("%04d-%02d-%02d", year, month, day); diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java index a0a9abbd81..c97e1612e3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/literal/DateTimeLiteral.java @@ -247,6 +247,11 @@ public class DateTimeLiteral extends DateLiteral { return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second; } + @Override + public double getDouble() { + return (double) getValue(); + } + @Override public String toSql() { return toString(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java index f14c9e0e43..5e6706e9b0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/DataType.java @@ -587,4 +587,8 @@ public abstract class DataType implements AbstractDataType { return (Map) ImmutableMap.copyOf(promotionMap); } } + + public double rangeLength(double high, double low) { + return high - low; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java index ee491fa18d..7defd1b61f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/DateLikeType.java @@ -17,8 +17,30 @@ package org.apache.doris.nereids.types.coercion; +import java.time.temporal.ChronoUnit; +import java.util.Calendar; + /** * date like type. 
*/ public abstract class DateLikeType extends PrimitiveType { + private Calendar toCalendar(double d) { + //d = (year * 10000 + month * 100 + day) * 1000000L; + int date = (int) (d / 1000000); + int day = date % 100; + int month = (date / 100) % 100; + int year = date / 10000; + Calendar calendar = Calendar.getInstance(); + calendar.set(Calendar.YEAR, year); + calendar.set(Calendar.MONDAY, month); + calendar.set(Calendar.DAY_OF_MONTH, day); + return calendar; + } + + @Override + public double rangeLength(double high, double low) { + Calendar to = toCalendar(high); + Calendar from = toCalendar(low); + return ChronoUnit.DAYS.between(from.toInstant(), to.toInstant()); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java index bbc95169e9..74055e62f1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java @@ -17,6 +17,8 @@ package org.apache.doris.statistics; +import org.apache.doris.nereids.types.DataType; + import java.util.Objects; public class StatisticRange { @@ -34,10 +36,13 @@ public class StatisticRange { private final double distinctValues; - public StatisticRange(double low, double high, double distinctValues) { + private final DataType dataType; + + public StatisticRange(double low, double high, double distinctValues, DataType dataType) { this.low = low; this.high = high; this.distinctValues = distinctValues; + this.dataType = dataType; } public double overlapPercentWith(StatisticRange other) { @@ -50,7 +55,7 @@ public class StatisticRange { return 1.0; } - double lengthOfIntersect = Math.min(this.high, other.high) - Math.max(this.low, other.low); + double lengthOfIntersect = dataType.rangeLength(Math.min(this.high, other.high), Math.max(this.low, other.low)); if (Double.isInfinite(lengthOfIntersect)) { if 
(Double.isFinite(this.distinctValues) && Double.isFinite(other.distinctValues)) { return Math.min(other.distinctValues / this.distinctValues, 1); @@ -73,8 +78,8 @@ public class StatisticRange { return INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; } - public static StatisticRange empty() { - return new StatisticRange(Double.NaN, Double.NaN, 0); + public static StatisticRange empty(DataType dataType) { + return new StatisticRange(Double.NaN, Double.NaN, 0, dataType); } public boolean isEmpty() { @@ -85,8 +90,8 @@ public class StatisticRange { return Double.isInfinite(low) && Double.isInfinite(high); } - public static StatisticRange from(ColumnStatistic column) { - return new StatisticRange(column.minValue, column.maxValue, column.ndv); + public static StatisticRange from(ColumnStatistic column, DataType dataType) { + return new StatisticRange(column.minValue, column.maxValue, column.ndv, dataType); } public double getLow() { @@ -98,16 +103,16 @@ public class StatisticRange { } public double length() { - return this.high - this.low; + return dataType.rangeLength(this.high, this.low); } public StatisticRange intersect(StatisticRange other) { double newLow = Math.max(low, other.low); double newHigh = Math.min(high, other.high); if (newLow <= newHigh) { - return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other)); + return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other), dataType); } - return empty(); + return empty(dataType); } public StatisticRange cover(StatisticRange other) { @@ -117,9 +122,9 @@ public class StatisticRange { double overlapPercentOfLeft = overlapPercentWith(other); double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft); - return new StatisticRange(newLow, newHigh, coveredDistinctValues); + return new StatisticRange(newLow, newHigh, coveredDistinctValues, dataType); } - return empty(); + return 
empty(dataType); } public StatisticRange union(StatisticRange other) { @@ -130,7 +135,7 @@ public class StatisticRange { double maxOverlapNDV = Math.max(overlapNDVThis, overlapNDVOther); double newNDV = maxOverlapNDV + ((1 - overlapPercentThis) * distinctValues) + ((1 - overlapPercentOther) * other.distinctValues); - return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV); + return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV, dataType); } private double overlappingDistinctValues(StatisticRange other) { @@ -168,7 +173,7 @@ public class StatisticRange { return distinctValues; } - public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic) { - return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv); + public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic, DataType dataType) { + return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv, dataType); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index dbe1d9a1a1..9452eb6ff8 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -29,8 +29,10 @@ import org.apache.doris.nereids.trees.expressions.LessThanEqual; import org.apache.doris.nereids.trees.expressions.Not; import org.apache.doris.nereids.trees.expressions.Or; import org.apache.doris.nereids.trees.expressions.SlotReference; +import org.apache.doris.nereids.trees.expressions.literal.DateLiteral; import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral; import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral; +import org.apache.doris.nereids.types.DateType; import 
org.apache.doris.nereids.types.DoubleType; import org.apache.doris.nereids.types.IntegerType; import org.apache.doris.statistics.ColumnStatistic; @@ -867,4 +869,26 @@ class FilterEstimationTest { Assertions.assertTrue(colStats != null); Assertions.assertEquals(10, colStats.ndv, 0.1); } + + @Test + public void testDateRangeSelectivity() { + DateLiteral from = new DateLiteral("1990-01-01"); + DateLiteral to = new DateLiteral("2000-01-01"); + SlotReference a = new SlotReference("a", DateType.INSTANCE); + ColumnStatisticBuilder builder = new ColumnStatisticBuilder() + .setNdv(100) + .setAvgSizeByte(4) + .setNumNulls(0) + .setMaxValue(to.getDouble()) + .setMinValue(from.getDouble()) + .setSelectivity(1.0) + .setCount(100); + DateLiteral mid = new DateLiteral("1999-01-01"); + GreaterThan greaterThan = new GreaterThan(a, mid); + Statistics stats = new Statistics(100, new HashMap<>()); + stats.addColumnStats(a, builder.build()); + FilterEstimation filterEstimation = new FilterEstimation(); + Statistics result = filterEstimation.estimate(greaterThan, stats); + Assertions.assertEquals(result.getRowCount(), 10, 0.1); + } }