[opt](nereids) optimize str-like-col range filter estimation (#34542)
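We have an order-preserving mapping from string to double. For a string column A, we have double values for A.min and A.max. When estimating A < "abc", A.min/A.max can be used to judge whether "abc" lies between A.min and A.max, but they cannot be used for range estimation. Suppose "abc" is mapped to the double x. If we compute the selectivity with the formula "sel = (x - A.min) / (A.max - A.min)", we are likely to obtain extreme values. Therefore, for types that are not RangeScalable, the estimator now falls back to DEFAULT_INEQUALITY_COEFFICIENT unless the ranges do not overlap at all (sel = 0) or overlap completely (sel = 1).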
This commit is contained in:
@ -40,6 +40,7 @@ import org.apache.doris.nereids.trees.expressions.SlotReference;
|
||||
import org.apache.doris.nereids.trees.expressions.functions.Function;
|
||||
import org.apache.doris.nereids.trees.expressions.literal.Literal;
|
||||
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
|
||||
import org.apache.doris.nereids.types.coercion.RangeScalable;
|
||||
import org.apache.doris.statistics.ColumnStatistic;
|
||||
import org.apache.doris.statistics.ColumnStatisticBuilder;
|
||||
import org.apache.doris.statistics.StatisticRange;
|
||||
@ -494,6 +495,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
.setNdv(intersectRange.getDistinctValues())
|
||||
.setNumNulls(0);
|
||||
double sel = leftRange.overlapPercentWith(rightRange);
|
||||
if (!(leftExpr.getDataType() instanceof RangeScalable) && (sel != 0.0 && sel != 1.0)) {
|
||||
sel = DEFAULT_INEQUALITY_COEFFICIENT;
|
||||
}
|
||||
sel = getNotNullSelectivity(leftStats, sel);
|
||||
updatedStatistics = context.statistics.withSel(sel);
|
||||
leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount());
|
||||
@ -550,8 +554,9 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
}
|
||||
|
||||
double leftOverlapPercent = leftRange.overlapPercentWith(rightRange);
|
||||
// Left always greater than right
|
||||
if (leftOverlapPercent == 0) {
|
||||
|
||||
if (leftOverlapPercent == 0.0) {
|
||||
// Left always greater than right
|
||||
return context.statistics.withRowCount(0.0);
|
||||
}
|
||||
StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr,
|
||||
@ -580,9 +585,14 @@ public class FilterEstimation extends ExpressionVisitor<Statistics, EstimationCo
|
||||
.setNdv(rightStats.ndv * (rightAlwaysGreaterRangeFraction + rightOverlappingRangeFraction))
|
||||
.setNumNulls(0)
|
||||
.build();
|
||||
double sel = leftAlwaysLessThanRightPercent
|
||||
+ leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT
|
||||
+ leftOverlapPercent * rightAlwaysGreaterRangeFraction;
|
||||
double sel = DEFAULT_INEQUALITY_COEFFICIENT;
|
||||
if (leftExpr.getDataType() instanceof RangeScalable) {
|
||||
sel = leftAlwaysLessThanRightPercent
|
||||
+ leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT
|
||||
+ leftOverlapPercent * rightAlwaysGreaterRangeFraction;
|
||||
} else if (leftOverlapPercent == 1.0) {
|
||||
sel = 1.0;
|
||||
}
|
||||
context.addKeyIfSlot(leftExpr);
|
||||
context.addKeyIfSlot(rightExpr);
|
||||
return context.statistics.withSel(sel)
|
||||
|
||||
@ -19,11 +19,12 @@ package org.apache.doris.nereids.types;
|
||||
|
||||
import org.apache.doris.catalog.Type;
|
||||
import org.apache.doris.nereids.types.coercion.PrimitiveType;
|
||||
import org.apache.doris.nereids.types.coercion.RangeScalable;
|
||||
|
||||
/**
|
||||
* Time type in Nereids.
|
||||
*/
|
||||
public class TimeType extends PrimitiveType {
|
||||
public class TimeType extends PrimitiveType implements RangeScalable {
|
||||
|
||||
public static final TimeType INSTANCE = new TimeType();
|
||||
|
||||
|
||||
@ -20,11 +20,12 @@ package org.apache.doris.nereids.types;
|
||||
import org.apache.doris.catalog.ScalarType;
|
||||
import org.apache.doris.catalog.Type;
|
||||
import org.apache.doris.nereids.types.coercion.PrimitiveType;
|
||||
import org.apache.doris.nereids.types.coercion.RangeScalable;
|
||||
|
||||
/**
|
||||
* TimeV2 type in Nereids.
|
||||
*/
|
||||
public class TimeV2Type extends PrimitiveType {
|
||||
public class TimeV2Type extends PrimitiveType implements RangeScalable {
|
||||
|
||||
public static final TimeV2Type INSTANCE = new TimeV2Type();
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ import java.time.LocalDateTime;
|
||||
/**
|
||||
* date like type.
|
||||
*/
|
||||
public abstract class DateLikeType extends PrimitiveType {
|
||||
public abstract class DateLikeType extends PrimitiveType implements RangeScalable {
|
||||
|
||||
protected LocalDate toLocalDate(double d) {
|
||||
// d = (year * 10000 + month * 100 + day) * 1000000L;
|
||||
|
||||
@ -24,7 +24,7 @@ import org.apache.doris.nereids.types.DoubleType;
|
||||
/**
|
||||
* Abstract class for all numeric type in Nereids.
|
||||
*/
|
||||
public class NumericType extends PrimitiveType {
|
||||
public class NumericType extends PrimitiveType implements RangeScalable {
|
||||
|
||||
public static final NumericType INSTANCE = new NumericType();
|
||||
|
||||
|
||||
@ -0,0 +1,30 @@
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
package org.apache.doris.nereids.types.coercion;
|
||||
|
||||
/**
|
||||
* Numeric types and date/time-related types are range scalable.
|
||||
* A filter such as "A < 10" on a RangeScalable column can be estimated more accurately.
|
||||
* For example, given a relation R that contains 10 rows with R.A in the range (1, 100),
|
||||
* the selectivity of the filter "A < 10" is "(10 - 1) / (100 - 1)".
|
||||
* But for a string column A, the selectivity of the filter "A < 'abc'" cannot be estimated by range, although we could
|
||||
* have an order-preserving mapping from string values to doubles.
|
||||
*
|
||||
*/
|
||||
public interface RangeScalable {
|
||||
}
|
||||
Reference in New Issue
Block a user