diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index 29e30b30f3..f8298871f0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -267,8 +267,8 @@ public class JoinEstimation { } private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics rightStats, Join join) { - if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { - double sel = computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join); + if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join) || join.isMarkJoin()) { + double sel = join.isMarkJoin() ? 1.0 : computeSelectivityForBuildSideWhenColStatsUnknown(rightStats, join); if (join.getJoinType().isLeftSemiOrAntiJoin()) { return new StatisticsBuilder().setRowCount(leftStats.getRowCount() * sel) .putColumnStatistics(leftStats.columnStatistics()) diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out index 091bda5f39..96115bcbfd 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/bs_downgrade_shape/query45.out @@ -9,30 +9,34 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk] ---------------------PhysicalProject -----------------------PhysicalOlapScan[customer_address] apply RFs: RF3 ---------------------PhysicalDistribute[DistributionSpecReplicated] +------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] +--------------------PhysicalDistribute[DistributionSpecHash] ----------------------PhysicalProject -------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk] ---------------------------PhysicalProject -----------------------------PhysicalOlapScan[customer] apply RFs: RF2 ---------------------------PhysicalDistribute[DistributionSpecReplicated] +------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] +--------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk] -------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk] ---------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1 ---------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------PhysicalProject -------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() ---------------------------------------PhysicalProject -----------------------------------------PhysicalOlapScan[item] ---------------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------------PhysicalProject -------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) ---------------------------------------------PhysicalOlapScan[item] +------------------------------PhysicalProject +--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3 ------------------------------PhysicalDistribute[DistributionSpecReplicated] --------------------------------PhysicalProject ----------------------------------filter((date_dim.d_qoy = 1) and (date_dim.d_year = 2000)) ------------------------------------PhysicalOlapScan[date_dim] +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF0 ca_address_sk->[c_current_addr_sk] +--------------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------------PhysicalProject +------------------------------------PhysicalOlapScan[customer] apply RFs: RF0 +--------------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------------PhysicalProject +------------------------------------PhysicalOlapScan[customer_address] +--------------------PhysicalDistribute[DistributionSpecHash] +----------------------PhysicalProject +------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() +--------------------------PhysicalProject +----------------------------PhysicalOlapScan[item] +--------------------------PhysicalDistribute[DistributionSpecReplicated] +----------------------------PhysicalProject +------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) +--------------------------------PhysicalOlapScan[item] diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out index cb8a7ac592..96115bcbfd 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query45.out @@ -9,30 +9,34 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk] ---------------------PhysicalProject -----------------------PhysicalOlapScan[customer_address] apply RFs: RF3 +------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF3 i_item_sk->[ws_item_sk] --------------------PhysicalDistribute[DistributionSpecHash] ----------------------PhysicalProject -------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk] ---------------------------PhysicalProject -----------------------------PhysicalOlapScan[customer] apply RFs: RF2 ---------------------------PhysicalDistribute[DistributionSpecReplicated] +------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 c_customer_sk->[ws_bill_customer_sk] +--------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk] -------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk] ---------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1 ---------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------PhysicalProject -------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() ---------------------------------------PhysicalProject -----------------------------------------PhysicalOlapScan[item] ---------------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------------PhysicalProject -------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) ---------------------------------------------PhysicalOlapScan[item] +------------------------------PhysicalProject +--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3 ------------------------------PhysicalDistribute[DistributionSpecReplicated] --------------------------------PhysicalProject ----------------------------------filter((date_dim.d_qoy = 1) and (date_dim.d_year = 2000)) ------------------------------------PhysicalOlapScan[date_dim] +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF0 ca_address_sk->[c_current_addr_sk] +--------------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------------PhysicalProject +------------------------------------PhysicalOlapScan[customer] apply RFs: RF0 +--------------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------------PhysicalProject +------------------------------------PhysicalOlapScan[customer_address] +--------------------PhysicalDistribute[DistributionSpecHash] +----------------------PhysicalProject +------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() +--------------------------PhysicalProject +----------------------------PhysicalOlapScan[item] +--------------------------PhysicalDistribute[DistributionSpecReplicated] +----------------------------PhysicalProject +------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) +--------------------------------PhysicalOlapScan[item] diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out index 3cacbaeedc..06a6c3d034 100644 --- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out +++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query51.out @@ -19,9 +19,9 @@ PhysicalResultSink --------------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------------hashAgg[LOCAL] ------------------------------------PhysicalProject ---------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ss_sold_date_sk] +--------------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk] ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF1 +------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 ----------------------------------------PhysicalDistribute[DistributionSpecReplicated] ------------------------------------------PhysicalProject --------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212)) @@ -35,9 +35,9 @@ PhysicalResultSink --------------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------------hashAgg[LOCAL] ------------------------------------PhysicalProject ---------------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ws_sold_date_sk] +--------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[ss_sold_date_sk] ----------------------------------------PhysicalProject -------------------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 +------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF0 ----------------------------------------PhysicalDistribute[DistributionSpecReplicated] ------------------------------------------PhysicalProject --------------------------------------------filter((date_dim.d_month_seq <= 1223) and (date_dim.d_month_seq >= 1212)) diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out index b3b09807b2..8a288f8102 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query45.out @@ -9,30 +9,34 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk] ---------------------PhysicalProject -----------------------PhysicalOlapScan[customer_address] apply RFs: RF3 +------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() --------------------PhysicalDistribute[DistributionSpecHash] ----------------------PhysicalProject -------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk] ---------------------------PhysicalProject -----------------------------PhysicalOlapScan[customer] apply RFs: RF2 ---------------------------PhysicalDistribute[DistributionSpecReplicated] +------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF2 i_item_sk->[ws_item_sk] +--------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk] -------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk] ---------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1 ---------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------PhysicalProject -------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() ---------------------------------------PhysicalProject -----------------------------------------PhysicalOlapScan[item] ---------------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------------PhysicalProject -------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) ---------------------------------------------PhysicalOlapScan[item] +------------------------------PhysicalProject +--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 ------------------------------PhysicalDistribute[DistributionSpecReplicated] --------------------------------PhysicalProject ----------------------------------filter((date_dim.d_qoy = 2) and (date_dim.d_year = 2000)) ------------------------------------PhysicalOlapScan[date_dim] +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() +--------------------------------PhysicalProject +----------------------------------PhysicalOlapScan[item] +--------------------------------PhysicalDistribute[DistributionSpecReplicated] +----------------------------------PhysicalProject +------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) +--------------------------------------PhysicalOlapScan[item] +--------------------PhysicalDistribute[DistributionSpecHash] +----------------------PhysicalProject +------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------PhysicalOlapScan[customer] +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------PhysicalOlapScan[customer_address] diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out index b3b09807b2..1c368d766e 100644 --- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out +++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query45.out @@ -9,30 +9,34 @@ PhysicalResultSink ------------hashAgg[LOCAL] --------------PhysicalProject ----------------filter((substring(ca_zip, 1, 5) IN ('80348', '81792', '83405', '85392', '85460', '85669', '86197', '86475', '88274') OR $c$1)) -------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF3 c_current_addr_sk->[ca_address_sk] ---------------------PhysicalProject -----------------------PhysicalOlapScan[customer_address] apply RFs: RF3 +------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ws_bill_customer_sk] --------------------PhysicalDistribute[DistributionSpecHash] ----------------------PhysicalProject -------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_bill_customer_sk = customer.c_customer_sk)) otherCondition=() build RFs:RF2 ws_bill_customer_sk->[c_customer_sk] ---------------------------PhysicalProject -----------------------------PhysicalOlapScan[customer] apply RFs: RF2 ---------------------------PhysicalDistribute[DistributionSpecReplicated] +------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF2 i_item_sk->[ws_item_sk] +--------------------------PhysicalDistribute[DistributionSpecHash] ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk] -------------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_item_sk = item.i_item_sk)) otherCondition=() build RFs:RF0 i_item_sk->[ws_item_sk] ---------------------------------PhysicalProject -----------------------------------PhysicalOlapScan[web_sales] apply RFs: RF0 RF1 ---------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------PhysicalProject -------------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() ---------------------------------------PhysicalProject -----------------------------------------PhysicalOlapScan[item] ---------------------------------------PhysicalDistribute[DistributionSpecReplicated] -----------------------------------------PhysicalProject -------------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) ---------------------------------------------PhysicalOlapScan[item] +------------------------------PhysicalProject +--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1 RF2 RF3 ------------------------------PhysicalDistribute[DistributionSpecReplicated] --------------------------------PhysicalProject ----------------------------------filter((date_dim.d_qoy = 2) and (date_dim.d_year = 2000)) ------------------------------------PhysicalOlapScan[date_dim] +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((item.i_item_id = item.i_item_id)) otherCondition=() +--------------------------------PhysicalProject +----------------------------------PhysicalOlapScan[item] +--------------------------------PhysicalDistribute[DistributionSpecReplicated] +----------------------------------PhysicalProject +------------------------------------filter(i_item_sk IN (11, 13, 17, 19, 2, 23, 29, 3, 5, 7)) +--------------------------------------PhysicalOlapScan[item] +--------------------PhysicalDistribute[DistributionSpecHash] +----------------------PhysicalProject +------------------------hashJoin[INNER_JOIN] hashCondition=((customer.c_current_addr_sk = customer_address.ca_address_sk)) otherCondition=() build RFs:RF0 ca_address_sk->[c_current_addr_sk] +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------PhysicalOlapScan[customer] apply RFs: RF0 +--------------------------PhysicalDistribute[DistributionSpecHash] +----------------------------PhysicalProject +------------------------------PhysicalOlapScan[customer_address]