[improve](nereids)compute statsRange.length() according to the column datatype (#18331)

We map date/datetime (and their V2 variants) to double. This mapping preserves date order, but it does not preserve range length.
For example, from 1990-01-01 to 1991-01-01 there are 12 months, so for the filter `A < 1990-02-01` the selectivity
should be about `1/12`.

If we instead compute the selectivity of this filter from the corresponding double values, we get
`sel = (19900201 - 19900101) / (19910101 - 19900101) = 100/10000 = 1/100`

The estimate is off by roughly a factor of 10.
This PR aims to fix this error.
Describe your changes.

Solution:
Convert each double back to its corresponding data type (date/datev2), then compute the range length with respect to that data type.
This commit is contained in:
minghong
2023-04-04 14:20:34 +08:00
committed by GitHub
parent 175e5d405c
commit 3fc8c19735
8 changed files with 94 additions and 27 deletions

View File

@ -176,7 +176,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains);
}
//rightRange.distinctValues should not be used
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv);
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv,
leftExpr.getDataType());
return estimateBinaryComparisonFilter(leftExpr,
statsForLeft,
rightRange, context);
@ -189,7 +190,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
}
//rightRange.distinctValues should not be used
StatisticRange rightRange = new StatisticRange(val, statsForLeft.maxValue,
statsForLeft.ndv);
statsForLeft.ndv, leftExpr.getDataType());
return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context);
}
@ -360,7 +361,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats,
StatisticRange rightRange, EstimationContext context) {
StatisticRange leftRange =
new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv);
new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv, leftExpr.getDataType());
StatisticRange intersectRange = leftRange.cover(rightRange);
ColumnStatisticBuilder leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
.setMinValue(intersectRange.getLow())
@ -375,8 +376,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatistic leftStats,
Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
StatisticRange leftRange = StatisticRange.from(leftStats);
StatisticRange rightRange = StatisticRange.from(rightStats);
StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
StatisticRange leftIntersectRight = leftRange.intersect(rightRange);
StatisticRange rightIntersectLeft = rightRange.intersect(leftIntersectRight);
ColumnStatisticBuilder leftBuilder = new ColumnStatisticBuilder(leftStats);
@ -396,8 +397,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStatistic leftStats,
Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
StatisticRange leftRange = StatisticRange.from(leftStats);
StatisticRange rightRange = StatisticRange.from(rightStats);
StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
Statistics statistics = null;
// Left always less than Right
if (leftRange.getHigh() < rightRange.getLow()) {
@ -414,7 +415,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
return context.statistics.withRowCount(0.0);
}
StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue,
rightStats.minValue, Double.NaN);
rightStats.minValue, Double.NaN, leftExpr.getDataType());
double leftAlwaysLessThanRightPercent = 0;
if (leftRange.getLow() < rightRange.getLow()) {
leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange);
@ -429,7 +430,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
double rightAlwaysGreaterRangeFraction = 0;
if (leftRange.getHigh() < rightRange.getHigh()) {
rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange(leftRange.getHigh(),
rightRange.getHigh(), Double.NaN));
rightRange.getHigh(), Double.NaN, rightExpr.getDataType()));
}
ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats)
.setMinValue(Math.max(leftRange.getLow(), rightRange.getLow()))

View File

@ -89,6 +89,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalTopN;
import org.apache.doris.nereids.trees.plans.physical.PhysicalUnion;
import org.apache.doris.nereids.trees.plans.physical.PhysicalWindow;
import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanVisitor;
import org.apache.doris.nereids.types.DataType;
import org.apache.doris.statistics.ColumnLevelStatisticCache;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
@ -577,7 +578,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
double rightRowCount = childStats.get(j).getRowCount();
ColumnStatistic estimatedColumnStatistics
= unionColumn(headStats.findColumnStatistics(headSlot),
headStats.getRowCount(), rightStatistic, rightRowCount);
headStats.getRowCount(), rightStatistic, rightRowCount, headSlot.getDataType());
headStats.addColumnStats(headSlot, estimatedColumnStatistics);
leftRowCount += childStats.get(j).getRowCount();
}
@ -692,12 +693,12 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
}
private ColumnStatistic unionColumn(ColumnStatistic leftStats, double leftRowCount, ColumnStatistic rightStats,
double rightRowCount) {
double rightRowCount, DataType dataType) {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
columnStatisticBuilder.setMaxValue(Math.max(leftStats.maxValue, rightStats.maxValue));
columnStatisticBuilder.setMinValue(Math.min(leftStats.minValue, rightStats.minValue));
StatisticRange leftRange = StatisticRange.from(leftStats);
StatisticRange rightRange = StatisticRange.from(rightStats);
StatisticRange leftRange = StatisticRange.from(leftStats, dataType);
StatisticRange rightRange = StatisticRange.from(rightStats, dataType);
StatisticRange newRange = leftRange.union(rightRange);
double newRowCount = leftRowCount + rightRowCount;
double leftSize = (leftRowCount - leftStats.numNulls) * leftStats.avgSizeByte;

View File

@ -148,6 +148,11 @@ public class DateLiteral extends Literal {
return (year * 10000 + month * 100 + day) * 1000000L;
}
/**
 * Returns the result of {@code getValue()} converted to a {@code double}.
 */
@Override
public double getDouble() {
    final double asDouble = (double) getValue();
    return asDouble;
}
@Override
public String getStringValue() {
return String.format("%04d-%02d-%02d", year, month, day);

View File

@ -247,6 +247,11 @@ public class DateTimeLiteral extends DateLiteral {
return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second;
}
/**
 * Returns the result of {@code getValue()} converted to a {@code double}.
 */
@Override
public double getDouble() {
    final double asDouble = (double) getValue();
    return asDouble;
}
@Override
public String toSql() {
return toString();

View File

@ -587,4 +587,8 @@ public abstract class DataType implements AbstractDataType {
return (Map) ImmutableMap.copyOf(promotionMap);
}
}
/**
 * Computes the length of the range {@code [low, high]} as the plain numeric difference.
 *
 * @param high upper bound of the range
 * @param low lower bound of the range
 * @return {@code high - low}
 */
public double rangeLength(double high, double low) {
    final double span = high - low;
    return span;
}
}

View File

@ -17,8 +17,30 @@
package org.apache.doris.nereids.types.coercion;
import java.time.temporal.ChronoUnit;
import java.util.Calendar;
import java.util.TimeZone;
/**
 * date like type.
 *
 * <p>Values of these types are handled by the statistics code as doubles encoded as
 * {@code (year * 10000 + month * 100 + day) * 1000000}. That encoding keeps date order but not
 * distances, so this class decodes the bounds back into calendar dates and measures a range
 * in days.
 */
public abstract class DateLikeType extends PrimitiveType {

    /**
     * Decodes an encoded date value into a calendar set to midnight UTC of that date.
     *
     * @param d encoded value, {@code (year * 10000 + month * 100 + day) * 1000000}
     * @return a calendar whose only populated fields are year, month and day
     */
    private Calendar toCalendar(double d) {
        // d = (year * 10000 + month * 100 + day) * 1000000L; digits below the day are dropped here
        int date = (int) (d / 1000000);
        int day = date % 100;
        int month = (date / 100) % 100;
        int year = date / 10000;
        // A fixed UTC zone plus clear() removes the current time of day and any daylight-saving
        // shifts, so two decoded dates are always a whole number of 24-hour days apart.
        Calendar calendar = Calendar.getInstance(TimeZone.getTimeZone("UTC"));
        calendar.clear();
        // Calendar months are zero-based (JANUARY == 0) while the encoded month is one-based.
        calendar.set(year, month - 1, day);
        return calendar;
    }

    /**
     * Computes the length of the range {@code [low, high]} in days.
     *
     * @param high encoded upper bound
     * @param low encoded lower bound
     * @return number of days from {@code low} to {@code high}; when either bound is infinite or
     *         NaN the plain difference {@code high - low} is returned so that callers still see
     *         an infinite (or NaN) length for unbounded or empty ranges
     */
    @Override
    public double rangeLength(double high, double low) {
        if (!Double.isFinite(high) || !Double.isFinite(low)) {
            // Non-finite bounds cannot be decoded into a date; keep the numeric semantics.
            return high - low;
        }
        Calendar to = toCalendar(high);
        Calendar from = toCalendar(low);
        return ChronoUnit.DAYS.between(from.toInstant(), to.toInstant());
    }
}

View File

@ -17,6 +17,8 @@
package org.apache.doris.statistics;
import org.apache.doris.nereids.types.DataType;
import java.util.Objects;
public class StatisticRange {
@ -34,10 +36,13 @@ public class StatisticRange {
private final double distinctValues;
public StatisticRange(double low, double high, double distinctValues) {
private final DataType dataType;
/**
 * Creates a range over {@code [low, high]}.
 *
 * @param low lower bound of the range
 * @param high upper bound of the range
 * @param distinctValues number of distinct values attributed to the range
 * @param dataType data type stored with the range
 */
public StatisticRange(double low, double high, double distinctValues, DataType dataType) {
    this.dataType = dataType;
    this.distinctValues = distinctValues;
    this.high = high;
    this.low = low;
}
public double overlapPercentWith(StatisticRange other) {
@ -50,7 +55,7 @@ public class StatisticRange {
return 1.0;
}
double lengthOfIntersect = Math.min(this.high, other.high) - Math.max(this.low, other.low);
double lengthOfIntersect = dataType.rangeLength(Math.min(this.high, other.high), Math.max(this.low, other.low));
if (Double.isInfinite(lengthOfIntersect)) {
if (Double.isFinite(this.distinctValues) && Double.isFinite(other.distinctValues)) {
return Math.min(other.distinctValues / this.distinctValues, 1);
@ -73,8 +78,8 @@ public class StatisticRange {
return INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
}
public static StatisticRange empty() {
return new StatisticRange(Double.NaN, Double.NaN, 0);
/**
 * Creates a range whose bounds are both NaN and whose distinct-value count is 0.
 *
 * @param dataType data type stored with the returned range
 * @return the new range
 */
public static StatisticRange empty(DataType dataType) {
    final double noBound = Double.NaN;
    return new StatisticRange(noBound, noBound, 0, dataType);
}
public boolean isEmpty() {
@ -85,8 +90,8 @@ public class StatisticRange {
return Double.isInfinite(low) && Double.isInfinite(high);
}
public static StatisticRange from(ColumnStatistic column) {
return new StatisticRange(column.minValue, column.maxValue, column.ndv);
/**
 * Builds a range from a column's min value, max value and ndv.
 *
 * @param column column statistics supplying the bounds and distinct-value count
 * @param dataType data type stored with the returned range
 * @return the new range
 */
public static StatisticRange from(ColumnStatistic column, DataType dataType) {
    final double lowerBound = column.minValue;
    final double upperBound = column.maxValue;
    return new StatisticRange(lowerBound, upperBound, column.ndv, dataType);
}
public double getLow() {
@ -98,16 +103,16 @@ public class StatisticRange {
}
public double length() {
return this.high - this.low;
return dataType.rangeLength(this.high, this.low);
}
public StatisticRange intersect(StatisticRange other) {
double newLow = Math.max(low, other.low);
double newHigh = Math.min(high, other.high);
if (newLow <= newHigh) {
return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other));
return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other), dataType);
}
return empty();
return empty(dataType);
}
public StatisticRange cover(StatisticRange other) {
@ -117,9 +122,9 @@ public class StatisticRange {
double overlapPercentOfLeft = overlapPercentWith(other);
double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues;
double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft);
return new StatisticRange(newLow, newHigh, coveredDistinctValues);
return new StatisticRange(newLow, newHigh, coveredDistinctValues, dataType);
}
return empty();
return empty(dataType);
}
public StatisticRange union(StatisticRange other) {
@ -130,7 +135,7 @@ public class StatisticRange {
double maxOverlapNDV = Math.max(overlapNDVThis, overlapNDVOther);
double newNDV = maxOverlapNDV + ((1 - overlapPercentThis) * distinctValues)
+ ((1 - overlapPercentOther) * other.distinctValues);
return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV);
return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV, dataType);
}
private double overlappingDistinctValues(StatisticRange other) {
@ -168,7 +173,7 @@ public class StatisticRange {
return distinctValues;
}
public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic) {
return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv);
/**
 * Builds a range from a column's min value, max value and ndv.
 *
 * @param columnStatistic column statistics supplying the bounds and distinct-value count
 * @param dataType data type stored with the returned range
 * @return the new range
 */
public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic, DataType dataType) {
    final double lowerBound = columnStatistic.minValue;
    final double upperBound = columnStatistic.maxValue;
    return new StatisticRange(lowerBound, upperBound, columnStatistic.ndv, dataType);
}
}

View File

@ -29,8 +29,10 @@ import org.apache.doris.nereids.trees.expressions.LessThanEqual;
import org.apache.doris.nereids.trees.expressions.Not;
import org.apache.doris.nereids.trees.expressions.Or;
import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.literal.DateLiteral;
import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral;
import org.apache.doris.nereids.types.DateType;
import org.apache.doris.nereids.types.DoubleType;
import org.apache.doris.nereids.types.IntegerType;
import org.apache.doris.statistics.ColumnStatistic;
@ -867,4 +869,26 @@ class FilterEstimationTest {
Assertions.assertTrue(colStats != null);
Assertions.assertEquals(10, colStats.ndv, 0.1);
}
/**
 * A date column spans 1990-01-01 to 2000-01-01 with 100 rows; the filter {@code a > 1999-01-01}
 * keeps roughly the last of those ten years, so about 10 rows are expected.
 */
@Test
public void testDateRangeSelectivity() {
    DateLiteral from = new DateLiteral("1990-01-01");
    DateLiteral to = new DateLiteral("2000-01-01");
    SlotReference a = new SlotReference("a", DateType.INSTANCE);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
            .setNdv(100)
            .setAvgSizeByte(4)
            .setNumNulls(0)
            .setMaxValue(to.getDouble())
            .setMinValue(from.getDouble())
            .setSelectivity(1.0)
            .setCount(100);
    DateLiteral mid = new DateLiteral("1999-01-01");
    GreaterThan greaterThan = new GreaterThan(a, mid);
    Statistics stats = new Statistics(100, new HashMap<>());
    stats.addColumnStats(a, builder.build());
    FilterEstimation filterEstimation = new FilterEstimation();
    Statistics result = filterEstimation.estimate(greaterThan, stats);
    // assertEquals takes (expected, actual, delta); keeping that order makes failure messages correct.
    Assertions.assertEquals(10, result.getRowCount(), 0.1);
}
}