[opt](nereids) optimize str-like-col range filter estimation (#34542)

We have an order-preserving mapping from string to double.
For a string column A, we have double values for A.min and A.max.
When estimating A<"abc", A.min/max can be used to judge whether "abc" lies between A.min and A.max, but they cannot be used to do range estimation. Suppose "abc" is mapped to the double x. If we compute selectivity by the formula "sel = (x-A.min)/(A.max-A.min)", we are likely to obtain extreme values.
This commit is contained in:
minghong
2024-05-09 22:42:01 +08:00
committed by yiguolei
parent e2ea54c0a7
commit d5d6c7f8a4
8 changed files with 157 additions and 27 deletions

View File

@ -40,6 +40,7 @@ import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.functions.Function;
import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.coercion.RangeScalable;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
import org.apache.doris.statistics.StatisticRange;
@ -494,6 +495,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
.setNdv(intersectRange.getDistinctValues())
.setNumNulls(0);
double sel = leftRange.overlapPercentWith(rightRange);
if (!(leftExpr.getDataType() instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) {
sel = DEFAULT_INEQUALITY_COEFFICIENT;
}
sel = getNotNullSelectivity(leftStats, sel);
updatedStatistics = context.statistics.withSel(sel);
leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount());
@ -550,8 +554,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
}
double leftOverlapPercent = leftRange.overlapPercentWith(rightRange);
// Left always greater than right
if (leftOverlapPercent == 0) {
if (leftOverlapPercent == 0.0) {
// Left always greater than right
return context.statistics.withRowCount(0.0);
}
StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr,
@ -580,9 +585,14 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
.setNdv(rightStats.ndv * (rightAlwaysGreaterRangeFraction + rightOverlappingRangeFraction))
.setNumNulls(0)
.build();
double sel = leftAlwaysLessThanRightPercent
+ leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT
+ leftOverlapPercent * rightAlwaysGreaterRangeFraction;
double sel = DEFAULT_INEQUALITY_COEFFICIENT;
if (leftExpr.getDataType() instanceof RangeScalable) {
sel = leftAlwaysLessThanRightPercent
+ leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT
+ leftOverlapPercent * rightAlwaysGreaterRangeFraction;
} else if (leftOverlapPercent == 1.0) {
sel = 1.0;
}
context.addKeyIfSlot(leftExpr);
context.addKeyIfSlot(rightExpr);
return context.statistics.withSel(sel)

View File

@ -19,11 +19,12 @@ package org.apache.doris.nereids.types;
import org.apache.doris.catalog.Type;
import org.apache.doris.nereids.types.coercion.PrimitiveType;
import org.apache.doris.nereids.types.coercion.RangeScalable;
/**
* Time type in Nereids.
*/
public class TimeType extends PrimitiveType {
public class TimeType extends PrimitiveType implements RangeScalable {
public static final TimeType INSTANCE = new TimeType();

View File

@ -20,11 +20,12 @@ package org.apache.doris.nereids.types;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.nereids.types.coercion.PrimitiveType;
import org.apache.doris.nereids.types.coercion.RangeScalable;
/**
* TimeV2 type in Nereids.
*/
public class TimeV2Type extends PrimitiveType {
public class TimeV2Type extends PrimitiveType implements RangeScalable {
public static final TimeV2Type INSTANCE = new TimeV2Type();

View File

@ -33,7 +33,7 @@ import java.time.LocalDateTime;
/**
* date like type.
*/
public abstract class DateLikeType extends PrimitiveType {
public abstract class DateLikeType extends PrimitiveType implements RangeScalable {
protected LocalDate toLocalDate(double d) {
// d = (year * 10000 + month * 100 + day) * 1000000L;

View File

@ -24,7 +24,7 @@ import org.apache.doris.nereids.types.DoubleType;
/**
* Abstract class for all numeric type in Nereids.
*/
public class NumericType extends PrimitiveType {
public class NumericType extends PrimitiveType implements RangeScalable {
public static final NumericType INSTANCE = new NumericType();

View File

@ -0,0 +1,30 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.nereids.types.coercion;
/**
* Marker interface (it declares no methods) for data types that are range scalable,
* i.e. numeric types and date related types.
* <p>
* A range filter such as "A < 10" on a RangeScalable column can be estimated more accurately.
* For example, for a given relation R that contains 10 rows with R.A in (1, 100),
* the selectivity of the filter "A < 10" is "(10 - 1) / (100 - 1)".
* <p>
* For a string column A, the selectivity of the filter "A < 'abc'" cannot be estimated by range,
* even though we could have an order-preserving mapping from string values to double.
*/
public interface RangeScalable {
}