From d5d6c7f8a41b24062709cdd423653a1f1ec1a50d Mon Sep 17 00:00:00 2001 From: minghong Date: Thu, 9 May 2024 22:42:01 +0800 Subject: [PATCH] [opt](nereids) optimize str-like-col range filter estimation (#34542) We have an order-preserving mapping from string to double. For a string column A, we have double values for A.min and A.max. When estimating A<"abc", A.min/max can be used to judge whether 'abc' lies between A.min and A.max, but they cannot be used to do range estimation. Suppose "abc" is mapped to the double x. If we compute the selectivity by the formula "sel = (x-A.min)/(A.max-A.min)", we are likely to obtain extreme values. --- .../doris/nereids/stats/FilterEstimation.java | 20 +++-- .../apache/doris/nereids/types/TimeType.java | 3 +- .../doris/nereids/types/TimeV2Type.java | 3 +- .../nereids/types/coercion/DateLikeType.java | 2 +- .../nereids/types/coercion/NumericType.java | 2 +- .../nereids/types/coercion/RangeScalable.java | 30 +++++++ .../nereids/stats/FilterEstimationTest.java | 89 +++++++++++++++++++ .../nereids_ssb_shape_sf100_p0/shape/q2.2.out | 35 ++++---- 8 files changed, 157 insertions(+), 27 deletions(-) create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/RangeScalable.java diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index e13eed8954..45e6dcd2ab 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -40,6 +40,7 @@ import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.coercion.RangeScalable; import 
org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.StatisticRange; @@ -494,6 +495,9 @@ public class FilterEstimation extends ExpressionVisitor (2010,2012), sel=1 + Statistics agrtb = new FilterEstimation().estimate(new GreaterThan(a, b), baseStats); + Assertions.assertEquals(100, agrtb.getRowCount()); + // (2020-2022) < (2010,2012), sel=0 + Statistics alessb = new FilterEstimation().estimate(new LessThan(a, b), baseStats); + Assertions.assertEquals(0, alessb.getRowCount()); + + // (2020-2022) > (2010-2021), sel = DEFAULT (0.5) + Statistics agrtc = new FilterEstimation().estimate(new GreaterThan(a, c), baseStats); + Assertions.assertEquals(50, agrtc.getRowCount()); + } } diff --git a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out index d3b4e0af1a..f57f1958aa 100644 --- a/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out +++ b/regression-test/data/nereids_ssb_shape_sf100_p0/shape/q2.2.out @@ -4,26 +4,25 @@ PhysicalResultSink --PhysicalQuickSort[MERGE_SORT] ----PhysicalDistribute[DistributionSpecGather] ------PhysicalQuickSort[LOCAL_SORT] ---------PhysicalProject -----------hashAgg[GLOBAL] -------------PhysicalDistribute[DistributionSpecHash] ---------------hashAgg[LOCAL] -----------------PhysicalProject -------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_orderdate = dates.d_datekey)) otherCondition=() build RFs:RF2 d_datekey->[lo_orderdate] ---------------------PhysicalProject -----------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_suppkey = supplier.s_suppkey)) otherCondition=() build RFs:RF1 s_suppkey->[lo_suppkey] -------------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_partkey = part.p_partkey)) otherCondition=() build RFs:RF0 p_partkey->[lo_partkey] ---------------------------PhysicalProject 
-----------------------------PhysicalOlapScan[lineorder] apply RFs: RF0 RF1 RF2 ---------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------PhysicalProject -------------------------------filter((part.p_brand <= 'MFGR#2228') and (part.p_brand >= 'MFGR#2221')) ---------------------------------PhysicalOlapScan[part] +--------hashAgg[GLOBAL] +----------PhysicalDistribute[DistributionSpecHash] +------------hashAgg[LOCAL] +--------------PhysicalProject +----------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_orderdate = dates.d_datekey)) otherCondition=() build RFs:RF2 d_datekey->[lo_orderdate] +------------------PhysicalProject +--------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_partkey = part.p_partkey)) otherCondition=() build RFs:RF1 p_partkey->[lo_partkey] +----------------------hashJoin[INNER_JOIN] hashCondition=((lineorder.lo_suppkey = supplier.s_suppkey)) otherCondition=() build RFs:RF0 s_suppkey->[lo_suppkey] +------------------------PhysicalProject +--------------------------PhysicalOlapScan[lineorder] apply RFs: RF0 RF1 RF2 ------------------------PhysicalDistribute[DistributionSpecReplicated] --------------------------PhysicalProject ----------------------------filter((supplier.s_region = 'ASIA')) ------------------------------PhysicalOlapScan[supplier] ---------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------PhysicalProject -------------------------PhysicalOlapScan[dates] +----------------------PhysicalDistribute[DistributionSpecReplicated] +------------------------PhysicalProject +--------------------------filter((part.p_brand <= 'MFGR#2228') and (part.p_brand >= 'MFGR#2221')) +----------------------------PhysicalOlapScan[part] +------------------PhysicalDistribute[DistributionSpecReplicated] +--------------------PhysicalProject +----------------------PhysicalOlapScan[dates]