[improve](nereids)compute statsRange.length() according to the column datatype (#18331)
We map date/datetime (including the V2 types) to double. This mapping preserves date order, but it does not preserve range length. For example, from 1990-01-01 to 1991-01-01 there are 12 months, so for the filter `A < 1990-02-01` the selectivity should be about `1/12`. If we compute this filter from the corresponding double values, `sel = (19900201 - 19900101) / (19910101 - 19900101) = 100/10000 = 1/100`, and the error is about 10 times. This PR aims to fix this error. Solution: convert each double back to its corresponding data type (date/datev2), then compute the range length with respect to that data type.
This commit is contained in:
@ -176,7 +176,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains);
|
||||
}
|
||||
//rightRange.distinctValues should not be used
|
||||
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv);
|
||||
StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv,
|
||||
leftExpr.getDataType());
|
||||
return estimateBinaryComparisonFilter(leftExpr,
|
||||
statsForLeft,
|
||||
rightRange, context);
|
||||
@ -189,7 +190,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
}
|
||||
//rightRange.distinctValues should not be used
|
||||
StatisticRange rightRange = new StatisticRange(val, statsForLeft.maxValue,
|
||||
statsForLeft.ndv);
|
||||
statsForLeft.ndv, leftExpr.getDataType());
|
||||
return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context);
|
||||
}
|
||||
|
||||
@ -360,7 +361,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats,
|
||||
StatisticRange rightRange, EstimationContext context) {
|
||||
StatisticRange leftRange =
|
||||
new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv);
|
||||
new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv, leftExpr.getDataType());
|
||||
StatisticRange intersectRange = leftRange.cover(rightRange);
|
||||
ColumnStatisticBuilder leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
|
||||
.setMinValue(intersectRange.getLow())
|
||||
@ -375,8 +376,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
|
||||
private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatistic leftStats,
|
||||
Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
|
||||
StatisticRange leftRange = StatisticRange.from(leftStats);
|
||||
StatisticRange rightRange = StatisticRange.from(rightStats);
|
||||
StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
|
||||
StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
|
||||
StatisticRange leftIntersectRight = leftRange.intersect(rightRange);
|
||||
StatisticRange rightIntersectLeft = rightRange.intersect(leftIntersectRight);
|
||||
ColumnStatisticBuilder leftBuilder = new ColumnStatisticBuilder(leftStats);
|
||||
@ -396,8 +397,8 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
|
||||
private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStatistic leftStats,
|
||||
Expression rightExpr, ColumnStatistic rightStats, EstimationContext context) {
|
||||
StatisticRange leftRange = StatisticRange.from(leftStats);
|
||||
StatisticRange rightRange = StatisticRange.from(rightStats);
|
||||
StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType());
|
||||
StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType());
|
||||
Statistics statistics = null;
|
||||
// Left always less than Right
|
||||
if (leftRange.getHigh() < rightRange.getLow()) {
|
||||
@ -414,7 +415,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
return context.statistics.withRowCount(0.0);
|
||||
}
|
||||
StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue,
|
||||
rightStats.minValue, Double.NaN);
|
||||
rightStats.minValue, Double.NaN, leftExpr.getDataType());
|
||||
double leftAlwaysLessThanRightPercent = 0;
|
||||
if (leftRange.getLow() < rightRange.getLow()) {
|
||||
leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange);
|
||||
@ -429,7 +430,7 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
double rightAlwaysGreaterRangeFraction = 0;
|
||||
if (leftRange.getHigh() < rightRange.getHigh()) {
|
||||
rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange(leftRange.getHigh(),
|
||||
rightRange.getHigh(), Double.NaN));
|
||||
rightRange.getHigh(), Double.NaN, rightExpr.getDataType()));
|
||||
}
|
||||
ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats)
|
||||
.setMinValue(Math.max(leftRange.getLow(), rightRange.getLow()))
|
||||
|
||||
@ -89,6 +89,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalTopN;
|
||||
import org.apache.doris.nereids.trees.plans.physical.PhysicalUnion;
|
||||
import org.apache.doris.nereids.trees.plans.physical.PhysicalWindow;
|
||||
import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanVisitor;
|
||||
import org.apache.doris.nereids.types.DataType;
|
||||
import org.apache.doris.statistics.ColumnLevelStatisticCache;
|
||||
import org.apache.doris.statistics.ColumnStatistic;
|
||||
import org.apache.doris.statistics.ColumnStatisticBuilder;
|
||||
@ -577,7 +578,7 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
|
||||
double rightRowCount = childStats.get(j).getRowCount();
|
||||
ColumnStatistic estimatedColumnStatistics
|
||||
= unionColumn(headStats.findColumnStatistics(headSlot),
|
||||
headStats.getRowCount(), rightStatistic, rightRowCount);
|
||||
headStats.getRowCount(), rightStatistic, rightRowCount, headSlot.getDataType());
|
||||
headStats.addColumnStats(headSlot, estimatedColumnStatistics);
|
||||
leftRowCount += childStats.get(j).getRowCount();
|
||||
}
|
||||
@ -692,12 +693,12 @@ public class StatsCalculator extends DefaultPlanVisitor<Statistics, Void> {
|
||||
}
|
||||
|
||||
private ColumnStatistic unionColumn(ColumnStatistic leftStats, double leftRowCount, ColumnStatistic rightStats,
|
||||
double rightRowCount) {
|
||||
double rightRowCount, DataType dataType) {
|
||||
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
|
||||
columnStatisticBuilder.setMaxValue(Math.max(leftStats.maxValue, rightStats.maxValue));
|
||||
columnStatisticBuilder.setMinValue(Math.min(leftStats.minValue, rightStats.minValue));
|
||||
StatisticRange leftRange = StatisticRange.from(leftStats);
|
||||
StatisticRange rightRange = StatisticRange.from(rightStats);
|
||||
StatisticRange leftRange = StatisticRange.from(leftStats, dataType);
|
||||
StatisticRange rightRange = StatisticRange.from(rightStats, dataType);
|
||||
StatisticRange newRange = leftRange.union(rightRange);
|
||||
double newRowCount = leftRowCount + rightRowCount;
|
||||
double leftSize = (leftRowCount - leftStats.numNulls) * leftStats.avgSizeByte;
|
||||
|
||||
@ -148,6 +148,11 @@ public class DateLiteral extends Literal {
|
||||
return (year * 10000 + month * 100 + day) * 1000000L;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getDouble() {
|
||||
return (double) getValue();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getStringValue() {
|
||||
return String.format("%04d-%02d-%02d", year, month, day);
|
||||
|
||||
@ -247,6 +247,11 @@ public class DateTimeLiteral extends DateLiteral {
|
||||
return (year * 10000 + month * 100 + day) * 1000000L + hour * 10000 + minute * 100 + second;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getDouble() {
|
||||
return (double) getValue();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toSql() {
|
||||
return toString();
|
||||
|
||||
@ -587,4 +587,8 @@ public abstract class DataType implements AbstractDataType {
|
||||
return (Map) ImmutableMap.copyOf(promotionMap);
|
||||
}
|
||||
}
|
||||
|
||||
public double rangeLength(double high, double low) {
|
||||
return high - low;
|
||||
}
|
||||
}
|
||||
|
||||
@ -17,8 +17,30 @@
|
||||
|
||||
package org.apache.doris.nereids.types.coercion;
|
||||
|
||||
import java.time.temporal.ChronoUnit;
import java.util.Calendar;
import java.util.TimeZone;
|
||||
|
||||
/**
|
||||
* date like type.
|
||||
*/
|
||||
public abstract class DateLikeType extends PrimitiveType {
|
||||
private Calendar toCalendar(double d) {
|
||||
//d = (year * 10000 + month * 100 + day) * 1000000L;
|
||||
int date = (int) (d / 1000000);
|
||||
int day = date % 100;
|
||||
int month = (date / 100) % 100;
|
||||
int year = date / 10000;
|
||||
Calendar calendar = Calendar.getInstance();
|
||||
calendar.set(Calendar.YEAR, year);
|
||||
calendar.set(Calendar.MONDAY, month);
|
||||
calendar.set(Calendar.DAY_OF_MONTH, day);
|
||||
return calendar;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double rangeLength(double high, double low) {
|
||||
Calendar to = toCalendar(high);
|
||||
Calendar from = toCalendar(low);
|
||||
return ChronoUnit.DAYS.between(from.toInstant(), to.toInstant());
|
||||
}
|
||||
}
|
||||
|
||||
@ -17,6 +17,8 @@
|
||||
|
||||
package org.apache.doris.statistics;
|
||||
|
||||
import org.apache.doris.nereids.types.DataType;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public class StatisticRange {
|
||||
@ -34,10 +36,13 @@ public class StatisticRange {
|
||||
|
||||
private final double distinctValues;
|
||||
|
||||
public StatisticRange(double low, double high, double distinctValues) {
|
||||
private final DataType dataType;
|
||||
|
||||
public StatisticRange(double low, double high, double distinctValues, DataType dataType) {
|
||||
this.low = low;
|
||||
this.high = high;
|
||||
this.distinctValues = distinctValues;
|
||||
this.dataType = dataType;
|
||||
}
|
||||
|
||||
public double overlapPercentWith(StatisticRange other) {
|
||||
@ -50,7 +55,7 @@ public class StatisticRange {
|
||||
return 1.0;
|
||||
}
|
||||
|
||||
double lengthOfIntersect = Math.min(this.high, other.high) - Math.max(this.low, other.low);
|
||||
double lengthOfIntersect = dataType.rangeLength(Math.min(this.high, other.high), Math.max(this.low, other.low));
|
||||
if (Double.isInfinite(lengthOfIntersect)) {
|
||||
if (Double.isFinite(this.distinctValues) && Double.isFinite(other.distinctValues)) {
|
||||
return Math.min(other.distinctValues / this.distinctValues, 1);
|
||||
@ -73,8 +78,8 @@ public class StatisticRange {
|
||||
return INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR;
|
||||
}
|
||||
|
||||
public static StatisticRange empty() {
|
||||
return new StatisticRange(Double.NaN, Double.NaN, 0);
|
||||
public static StatisticRange empty(DataType dataType) {
|
||||
return new StatisticRange(Double.NaN, Double.NaN, 0, dataType);
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
@ -85,8 +90,8 @@ public class StatisticRange {
|
||||
return Double.isInfinite(low) && Double.isInfinite(high);
|
||||
}
|
||||
|
||||
public static StatisticRange from(ColumnStatistic column) {
|
||||
return new StatisticRange(column.minValue, column.maxValue, column.ndv);
|
||||
public static StatisticRange from(ColumnStatistic column, DataType dataType) {
|
||||
return new StatisticRange(column.minValue, column.maxValue, column.ndv, dataType);
|
||||
}
|
||||
|
||||
public double getLow() {
|
||||
@ -98,16 +103,16 @@ public class StatisticRange {
|
||||
}
|
||||
|
||||
public double length() {
|
||||
return this.high - this.low;
|
||||
return dataType.rangeLength(this.high, this.low);
|
||||
}
|
||||
|
||||
public StatisticRange intersect(StatisticRange other) {
|
||||
double newLow = Math.max(low, other.low);
|
||||
double newHigh = Math.min(high, other.high);
|
||||
if (newLow <= newHigh) {
|
||||
return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other));
|
||||
return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other), dataType);
|
||||
}
|
||||
return empty();
|
||||
return empty(dataType);
|
||||
}
|
||||
|
||||
public StatisticRange cover(StatisticRange other) {
|
||||
@ -117,9 +122,9 @@ public class StatisticRange {
|
||||
double overlapPercentOfLeft = overlapPercentWith(other);
|
||||
double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues;
|
||||
double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft);
|
||||
return new StatisticRange(newLow, newHigh, coveredDistinctValues);
|
||||
return new StatisticRange(newLow, newHigh, coveredDistinctValues, dataType);
|
||||
}
|
||||
return empty();
|
||||
return empty(dataType);
|
||||
}
|
||||
|
||||
public StatisticRange union(StatisticRange other) {
|
||||
@ -130,7 +135,7 @@ public class StatisticRange {
|
||||
double maxOverlapNDV = Math.max(overlapNDVThis, overlapNDVOther);
|
||||
double newNDV = maxOverlapNDV + ((1 - overlapPercentThis) * distinctValues)
|
||||
+ ((1 - overlapPercentOther) * other.distinctValues);
|
||||
return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV);
|
||||
return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV, dataType);
|
||||
}
|
||||
|
||||
private double overlappingDistinctValues(StatisticRange other) {
|
||||
@ -168,7 +173,7 @@ public class StatisticRange {
|
||||
return distinctValues;
|
||||
}
|
||||
|
||||
public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic) {
|
||||
return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv);
|
||||
public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic, DataType dataType) {
|
||||
return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv, dataType);
|
||||
}
|
||||
}
|
||||
|
||||
@ -29,8 +29,10 @@ import org.apache.doris.nereids.trees.expressions.LessThanEqual;
|
||||
import org.apache.doris.nereids.trees.expressions.Not;
|
||||
import org.apache.doris.nereids.trees.expressions.Or;
|
||||
import org.apache.doris.nereids.trees.expressions.SlotReference;
|
||||
import org.apache.doris.nereids.trees.expressions.literal.DateLiteral;
|
||||
import org.apache.doris.nereids.trees.expressions.literal.DoubleLiteral;
|
||||
import org.apache.doris.nereids.trees.expressions.literal.IntegerLiteral;
|
||||
import org.apache.doris.nereids.types.DateType;
|
||||
import org.apache.doris.nereids.types.DoubleType;
|
||||
import org.apache.doris.nereids.types.IntegerType;
|
||||
import org.apache.doris.statistics.ColumnStatistic;
|
||||
@ -867,4 +869,26 @@ class FilterEstimationTest {
|
||||
Assertions.assertTrue(colStats != null);
|
||||
Assertions.assertEquals(10, colStats.ndv, 0.1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDateRangeSelectivity() {
|
||||
DateLiteral from = new DateLiteral("1990-01-01");
|
||||
DateLiteral to = new DateLiteral("2000-01-01");
|
||||
SlotReference a = new SlotReference("a", DateType.INSTANCE);
|
||||
ColumnStatisticBuilder builder = new ColumnStatisticBuilder()
|
||||
.setNdv(100)
|
||||
.setAvgSizeByte(4)
|
||||
.setNumNulls(0)
|
||||
.setMaxValue(to.getDouble())
|
||||
.setMinValue(from.getDouble())
|
||||
.setSelectivity(1.0)
|
||||
.setCount(100);
|
||||
DateLiteral mid = new DateLiteral("1999-01-01");
|
||||
GreaterThan greaterThan = new GreaterThan(a, mid);
|
||||
Statistics stats = new Statistics(100, new HashMap<>());
|
||||
stats.addColumnStats(a, builder.build());
|
||||
FilterEstimation filterEstimation = new FilterEstimation();
|
||||
Statistics result = filterEstimation.estimate(greaterThan, stats);
|
||||
Assertions.assertEquals(result.getRowCount(), 10, 0.1);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user