diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 4ea3b00686..0c94ffe8a5 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -76,6 +76,12 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo _shared_hash_table_dependency->block(); p._shared_hashtable_controller->append_dependency(p.node_id(), _shared_hash_table_dependency); + } else { + if ((p._join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + p._join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) && + p._have_other_join_conjunct) { + _build_indexes_null = std::make_shared>(); + } } _memory_usage_counter = ADD_LABEL_COUNTER(profile(), "MemoryUsage"); @@ -250,44 +256,46 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, st = std::visit( Overload {[&](std::monostate& arg, auto join_op, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { + auto short_circuit_for_null_in_build_side, + auto with_other_conjuncts) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); return Status::OK(); }, [&](auto&& arg, auto&& join_op, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { + auto short_circuit_for_null_in_build_side, + auto with_other_conjuncts) -> Status { using HashTableCtxType = std::decay_t; using JoinOpType = std::decay_t; vectorized::ProcessHashTableBuild hash_table_build_process(rows, block, raw_ptrs, this, state->batch_size(), state); - return hash_table_build_process - .template run( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? &null_map_val->get_data() - : nullptr, - &_shared_state->_has_null_in_build_side); + return hash_table_build_process.template run< + JoinOpType::value, has_null_value, + short_circuit_for_null_in_build_side, with_other_conjuncts>( + arg, + has_null_value || short_circuit_for_null_in_build_side + ? 
&null_map_val->get_data() + : nullptr, + &_shared_state->_has_null_in_build_side); }}, *_shared_state->hash_table_variants, _shared_state->join_op_variants, vectorized::make_bool_variant(_build_side_ignore_null), - vectorized::make_bool_variant(p._short_circuit_for_null_in_build_side)); + vectorized::make_bool_variant(p._short_circuit_for_null_in_build_side), + vectorized::make_bool_variant((p._have_other_join_conjunct))); return st; } void HashJoinBuildSinkLocalState::_set_build_ignore_flag(vectorized::Block& block, const std::vector& res_col_ids) { + auto& p = _parent->cast(); for (size_t i = 0; i < _build_expr_ctxs.size(); ++i) { - if (!_shared_state->is_null_safe_eq_join[i]) { + if (!_shared_state->is_null_safe_eq_join[i] && !p._short_circuit_for_null_in_build_side) { const auto* column = block.get_by_position(res_col_ids[i]).column.get(); if (check_and_get_column(*column)) { - _build_side_ignore_null |= (_parent->cast()._join_op != - TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && - !_shared_state->store_null_in_hash_table[i]); + _build_side_ignore_null |= !_shared_state->store_null_in_hash_table[i]; } } } @@ -484,6 +492,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* state, local_state._shared_state->build_block.get(), &local_state, use_global_rf)); RETURN_IF_ERROR( local_state.process_build_block(state, (*local_state._shared_state->build_block))); + local_state._shared_state->build_indexes_null = local_state._build_indexes_null; if (_shared_hashtable_controller) { _shared_hash_table_context->status = Status::OK(); // arena will be shared with other instances. 
@@ -497,6 +506,8 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* _shared_hash_table_context); } _shared_hash_table_context->block = local_state._shared_state->build_block; + _shared_hash_table_context->build_indexes_null = + local_state._shared_state->build_indexes_null; _shared_hashtable_controller->signal(node_id()); } } else if (!local_state._should_build_hash_table) { @@ -527,6 +538,9 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* _shared_hash_table_context->hash_table_variants)); local_state._shared_state->build_block = _shared_hash_table_context->block; + local_state._build_indexes_null = _shared_hash_table_context->build_indexes_null; + local_state._shared_state->build_indexes_null = + _shared_hash_table_context->build_indexes_null; const bool use_global_rf = local_state._parent->cast()._use_global_rf; @@ -561,10 +575,11 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* local_state.init_short_circuit_for_probe(); if (source_state == SourceState::FINISHED) { - // Since the comparison of null values is meaningless, null aware left anti join should not output null + // Since the comparison of null values is meaningless, null aware left anti/semi join should not output null // when the build side is not empty. 
if (local_state._shared_state->build_block && - _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN)) { local_state._shared_state->probe_ignore_null = true; } local_state._dependency->set_ready_to_read(); diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 8420719330..f70d9e67bc 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -113,6 +113,13 @@ protected: std::shared_ptr _shared_hash_table_dependency; std::vector _build_col_ids; + /* + * For null aware anti/semi join with other join conjuncts, we do need to care about the rows in + * build side with null keys, + * because the other join conjuncts' result may be changed from null to false(null & false == false). + */ + std::shared_ptr> _build_indexes_null; + RuntimeProfile::Counter* _build_table_timer = nullptr; RuntimeProfile::Counter* _build_expr_call_timer = nullptr; RuntimeProfile::Counter* _build_table_insert_timer = nullptr; diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index c7029614c5..aef2e011fa 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -44,6 +44,12 @@ Status HashJoinProbeLocalState::init(RuntimeState* state, LocalStateInfo& info) for (size_t i = 0; i < _other_join_conjuncts.size(); i++) { RETURN_IF_ERROR(p._other_join_conjuncts[i]->clone(state, _other_join_conjuncts[i])); } + + _mark_join_conjuncts.resize(p._mark_join_conjuncts.size()); + for (size_t i = 0; i < _mark_join_conjuncts.size(); i++) { + RETURN_IF_ERROR(p._mark_join_conjuncts[i]->clone(state, _mark_join_conjuncts[i])); + } + _construct_mutable_join_block(); _probe_column_disguise_null.reserve(_probe_expr_ctxs.size()); _probe_arena_memory_usage = @@ -83,6 +89,7 @@ void 
HashJoinProbeLocalState::prepare_for_next() { _build_index = 0; _ready_probe = false; _last_probe_match = -1; + _last_probe_null_mark = -1; _prepare_probe_block(); } @@ -234,42 +241,6 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc source_state = SourceState::FINISHED; return Status::OK(); } - if (local_state._shared_state->_has_null_in_build_side && - _short_circuit_for_null_in_build_side && _is_mark_join) { - /// `_has_null_in_build_side` means have null value in build side. - /// `_short_circuit_for_null_in_build_side` means short circuit if has null in build side(e.g. null aware left anti join). - /// We need to create a column as mark with all rows set to NULL. - auto block_rows = local_state._probe_block.rows(); - if (block_rows == 0) { - if (local_state._probe_eos) { - source_state = SourceState::FINISHED; - } - return Status::OK(); - } - - vectorized::Block temp_block; - //get probe side output column - for (int i = 0; i < _left_output_slot_flags.size(); ++i) { - temp_block.insert(local_state._probe_block.get_by_position(i)); - } - auto mark_column = - vectorized::ColumnNullable::create(vectorized::ColumnUInt8::create(block_rows, 0), - vectorized::ColumnUInt8::create(block_rows, 1)); - temp_block.insert({std::move(mark_column), - make_nullable(std::make_shared()), ""}); - - { - SCOPED_TIMER(local_state._join_filter_timer); - RETURN_IF_ERROR(vectorized::VExprContext::filter_block( - local_state._conjuncts, &temp_block, temp_block.columns())); - } - - RETURN_IF_ERROR(local_state._build_output_block(&temp_block, output_block, false)); - temp_block.clear(); - local_state._probe_block.clear_column_data(_child_x->row_desc().num_materialized_slots()); - local_state.reached_limit(output_block, source_state); - return Status::OK(); - } //TODO: this short circuit maybe could refactor, no need to check at here. 
if (local_state._shared_state->empty_right_table_need_probe_dispose) { @@ -330,6 +301,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc Status st; if (local_state._probe_index < local_state._probe_block.rows()) { + local_state._build_indexes_null = local_state._shared_state->build_indexes_null; DCHECK(local_state._has_set_need_null_map_for_probe); RETURN_IF_CATCH_EXCEPTION({ std::visit( @@ -540,7 +512,8 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeState* state) DCHECK(tnode.__isset.hash_join_node); const bool probe_dispose_null = _match_all_probe || _build_unique || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - _join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN; + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || _join_op == TJoinOp::LEFT_ANTI_JOIN || + _join_op == TJoinOp::LEFT_SEMI_JOIN; const std::vector& eq_join_conjuncts = tnode.hash_join_node.eq_join_conjuncts; std::vector probe_not_ignore_null(eq_join_conjuncts.size()); size_t conjuncts_index = 0; @@ -575,6 +548,20 @@ Status HashJoinProbeOperatorX::init(const TPlanNode& tnode, RuntimeState* state) DCHECK(!_build_unique); DCHECK(_have_other_join_conjunct); } + + if (tnode.hash_join_node.__isset.mark_join_conjuncts && + !tnode.hash_join_node.mark_join_conjuncts.empty()) { + RETURN_IF_ERROR(vectorized::VExpr::create_expr_trees( + tnode.hash_join_node.mark_join_conjuncts, _mark_join_conjuncts)); + DCHECK(_is_mark_join); + + /// We make mark join conjuncts as equal conjuncts for null aware join, + /// so `_mark_join_conjuncts` should be empty if this is null aware join. 
+ DCHECK_EQ(_mark_join_conjuncts.empty(), + _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN); + } + return Status::OK(); } @@ -603,6 +590,11 @@ Status HashJoinProbeOperatorX::prepare(RuntimeState* state) { for (auto& conjunct : _other_join_conjuncts) { RETURN_IF_ERROR(conjunct->prepare(state, *_intermediate_row_desc)); } + + for (auto& conjunct : _mark_join_conjuncts) { + RETURN_IF_ERROR(conjunct->prepare(state, *_intermediate_row_desc)); + } + RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child_x->row_desc())); DCHECK(_build_side_child != nullptr); // right table data types @@ -621,6 +613,11 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { for (auto& conjunct : _other_join_conjuncts) { RETURN_IF_ERROR(conjunct->open(state)); } + + for (auto& conjunct : _mark_join_conjuncts) { + RETURN_IF_ERROR(conjunct->open(state)); + } + return Status::OK(); } diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.h b/be/src/pipeline/exec/hashjoin_probe_operator.h index 18db6acc67..1bdb9864c4 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.h +++ b/be/src/pipeline/exec/hashjoin_probe_operator.h @@ -59,6 +59,8 @@ using HashTableCtxVariants = std::variant< vectorized::ProcessHashTableProbe, vectorized::ProcessHashTableProbe, vectorized::ProcessHashTableProbe, + vectorized::ProcessHashTableProbe>; class HashJoinProbeDependency final : public Dependency { @@ -120,10 +122,23 @@ private: std::atomic _probe_inited = false; int _last_probe_match; + // For mark join, last probe index of null mark + int _last_probe_null_mark; + + /* + * For null aware anti/semi join with other join conjuncts, we do need to care about the rows in + * build side with null keys, + * because the other join conjuncts' result may be changed from null to false(null & false == false). 
+ */ + std::shared_ptr> _build_indexes_null; + vectorized::Block _probe_block; vectorized::ColumnRawPtrs _probe_columns; // other expr vectorized::VExprContextSPtrs _other_join_conjuncts; + + vectorized::VExprContextSPtrs _mark_join_conjuncts; + // probe expr vectorized::VExprContextSPtrs _probe_expr_ctxs; std::vector _probe_column_disguise_null; @@ -188,6 +203,9 @@ private: const bool _is_broadcast_join; // other expr vectorized::VExprContextSPtrs _other_join_conjuncts; + + vectorized::VExprContextSPtrs _mark_join_conjuncts; + // probe expr vectorized::VExprContextSPtrs _probe_expr_ctxs; bool _probe_ignore_null = false; diff --git a/be/src/pipeline/exec/join_build_sink_operator.cpp b/be/src/pipeline/exec/join_build_sink_operator.cpp index 798b5d86b0..73ebfb947a 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.cpp +++ b/be/src/pipeline/exec/join_build_sink_operator.cpp @@ -68,7 +68,8 @@ JoinBuildSinkOperatorX::JoinBuildSinkOperatorX(ObjectPool* pool, _join_op == TJoinOp::RIGHT_SEMI_JOIN), _is_left_semi_anti(_join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN || - _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN), + _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN), _is_outer_join(_match_all_build || _match_all_probe), _is_mark_join(tnode.__isset.nested_loop_join_node ? (tnode.nested_loop_join_node.__isset.is_mark @@ -76,11 +77,13 @@ JoinBuildSinkOperatorX::JoinBuildSinkOperatorX(ObjectPool* pool, : false) : tnode.hash_join_node.__isset.is_mark ? 
tnode.hash_join_node.is_mark : false), - _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && + !_is_mark_join) { _init_join_op(); if (_is_mark_join) { DCHECK(_join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN || - _join_op == TJoinOp::CROSS_JOIN || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) + _join_op == TJoinOp::CROSS_JOIN || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) << "Mark join is only supported for null aware left semi/anti join and cross join " "but this is " << _join_op; @@ -97,7 +100,8 @@ JoinBuildSinkOperatorX::JoinBuildSinkOperatorX(ObjectPool* pool, M(CROSS_JOIN) \ M(RIGHT_SEMI_JOIN) \ M(RIGHT_ANTI_JOIN) \ - M(NULL_AWARE_LEFT_ANTI_JOIN) + M(NULL_AWARE_LEFT_ANTI_JOIN) \ + M(NULL_AWARE_LEFT_SEMI_JOIN) template void JoinBuildSinkOperatorX::_init_join_op() { diff --git a/be/src/pipeline/exec/join_probe_operator.cpp b/be/src/pipeline/exec/join_probe_operator.cpp index 4e94b4a206..3e71ec41dd 100644 --- a/be/src/pipeline/exec/join_probe_operator.cpp +++ b/be/src/pipeline/exec/join_probe_operator.cpp @@ -65,9 +65,15 @@ void JoinProbeLocalState::_construct_mutable_join_block _join_block.insert({type_ptr->create_column(), type_ptr, slot_desc->col_name()}); } } + if (p._is_mark_join) { - DCHECK(!p._is_mark_join || - _join_block.get_by_position(_join_block.columns() - 1).column->is_nullable()); + _mark_column_id = _join_block.columns() - 1; +#ifndef NDEBUG + const auto& mark_column = assert_cast( + *_join_block.get_by_position(_mark_column_id).column); + auto& nested_column = mark_column.get_nested_column(); + DCHECK(check_and_get_column(nested_column) != nullptr); +#endif } } @@ -182,7 +188,8 @@ JoinProbeOperatorX::JoinProbeOperatorX(ObjectPool* pool, const T : false) : tnode.hash_join_node.__isset.is_mark ? 
tnode.hash_join_node.is_mark : false), - _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && + !_is_mark_join) { if (tnode.__isset.hash_join_node) { _intermediate_row_desc.reset(new RowDescriptor( descs, tnode.hash_join_node.vintermediate_tuple_id_list, diff --git a/be/src/pipeline/exec/join_probe_operator.h b/be/src/pipeline/exec/join_probe_operator.h index 9c6280bb64..5dc6089c34 100644 --- a/be/src/pipeline/exec/join_probe_operator.h +++ b/be/src/pipeline/exec/join_probe_operator.h @@ -53,6 +53,8 @@ protected: vectorized::MutableColumnPtr _tuple_is_null_left_flag_column = nullptr; vectorized::MutableColumnPtr _tuple_is_null_right_flag_column = nullptr; + size_t _mark_column_id = -1; + RuntimeProfile::Counter* _probe_timer = nullptr; RuntimeProfile::Counter* _probe_rows_counter = nullptr; RuntimeProfile::Counter* _join_filter_timer = nullptr; diff --git a/be/src/pipeline/pipeline_x/dependency.h b/be/src/pipeline/pipeline_x/dependency.h index 8a58973be3..5ae3480cd4 100644 --- a/be/src/pipeline/pipeline_x/dependency.h +++ b/be/src/pipeline/pipeline_x/dependency.h @@ -461,6 +461,7 @@ struct HashJoinSharedState : public JoinSharedState { const std::vector build_side_child_desc; size_t build_exprs_size = 0; std::shared_ptr build_block; + std::shared_ptr> build_indexes_null; bool probe_ignore_null = false; }; diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h index d10b24ade2..0bc7a5abbf 100644 --- a/be/src/vec/common/hash_table/hash_map.h +++ b/be/src/vec/common/hash_table/hash_map.h @@ -216,4 +216,4 @@ using HashMapWithStackMemory = HashMapTable< HashTableAllocatorWithStackMemory<(1ULL << initial_size_degree) * sizeof(HashMapCellWithSavedHash)>>; -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/vec/common/hash_table/join_hash_table.h 
b/be/src/vec/common/hash_table/join_hash_table.h index b190d3d89c..08311989b5 100644 --- a/be/src/vec/common/hash_table/join_hash_table.h +++ b/be/src/vec/common/hash_table/join_hash_table.h @@ -92,14 +92,9 @@ public: } } - if constexpr (with_other_conjuncts) { - return _find_batch_conjunct(keys, build_idx_map, probe_idx, build_idx, - probe_rows, probe_idxs, build_idxs); - } - - if constexpr (is_mark_join) { - return _find_batch_mark(keys, build_idx_map, probe_idx, probe_rows, - probe_idxs, build_idxs, mark_column); + if constexpr (with_other_conjuncts || is_mark_join) { + return _find_batch_conjunct( + keys, build_idx_map, probe_idx, build_idx, probe_rows, probe_idxs, build_idxs); } if constexpr (JoinOpType == TJoinOp::INNER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN || @@ -122,6 +117,80 @@ public: return std::tuple {0, 0U, 0}; } + /** + * Because the equality comparison result of null with any value is null, + * in null aware join, if the probe key of a row in the left table(probe side) is null, + * then this row will match all rows on the right table(build side) (the match result is null). + * If the probe key of a row in the left table does not match any row in right table, + * this row will match all rows with null key in the right table. 
+ * select 'a' in ('b', null) => 'a' = 'b' or 'a' = null => false or null => null + * select 'a' in ('a', 'b', null) => true + * select 'a' not in ('b', null) => null => 'a' != 'b' and 'a' != null => true and null => null + * select 'a' not in ('a', 'b', null) => false + */ + auto find_null_aware_with_other_conjuncts( + const Key* __restrict keys, const uint32_t* __restrict build_idx_map, int probe_idx, + uint32_t build_idx, int probe_rows, uint32_t* __restrict probe_idxs, + uint32_t* __restrict build_idxs, std::set& null_result, + const std::vector& build_indexes_null, const size_t build_block_count) { + auto matched_cnt = 0; + const auto batch_size = max_batch_size; + + bool has_matched = false; + auto do_the_probe = [&]() { + while (build_idx && matched_cnt < batch_size) { + if (build_idx == bucket_size) { + /// All rows in build side should be executed with other join conjuncts. + for (size_t i = 1; i != build_block_count; ++i) { + build_idxs[matched_cnt] = i; + probe_idxs[matched_cnt] = probe_idx; + matched_cnt++; + } + null_result.emplace(probe_idx); + build_idx = 0; + has_matched = true; + break; + } else if (keys[probe_idx] == build_keys[build_idx]) { + build_idxs[matched_cnt] = build_idx; + probe_idxs[matched_cnt] = probe_idx; + matched_cnt++; + has_matched = true; + } + + build_idx = next[build_idx]; + } + + // may over batch_size when emplace 0 into build_idxs + if (!build_idx) { + if (!has_matched) { // has no any row matched + for (auto index : build_indexes_null) { + build_idxs[matched_cnt] = index; + probe_idxs[matched_cnt] = probe_idx; + matched_cnt++; + } + } + probe_idxs[matched_cnt] = probe_idx; + build_idxs[matched_cnt] = 0; + matched_cnt++; + has_matched = false; + } + + probe_idx++; + }; + + if (build_idx) { + do_the_probe(); + } + + while (probe_idx < probe_rows && matched_cnt < batch_size) { + build_idx = build_idx_map[probe_idx]; + do_the_probe(); + } + + probe_idx -= (build_idx != 0); + return std::tuple {probe_idx, build_idx, 
matched_cnt}; + } + template bool iterate_map(std::vector& build_idxs) const { const auto batch_size = max_batch_size; @@ -157,42 +226,6 @@ public: } private: - // only LEFT_ANTI_JOIN/LEFT_SEMI_JOIN/NULL_AWARE_LEFT_ANTI_JOIN/CROSS_JOIN support mark join - template - auto _find_batch_mark(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, - int probe_idx, int probe_rows, uint32_t* __restrict probe_idxs, - uint32_t* __restrict build_idxs, - vectorized::ColumnFilterHelper* mark_column) { - auto matched_cnt = 0; - const auto batch_size = max_batch_size; - - while (probe_idx < probe_rows && matched_cnt < batch_size) { - auto build_idx = build_idx_map[probe_idx] == bucket_size ? 0 : build_idx_map[probe_idx]; - - while (build_idx && keys[probe_idx] != build_keys[build_idx]) { - build_idx = next[build_idx]; - } - - if (build_idx_map[probe_idx] == bucket_size) { - // mark result as null when probe row is null - mark_column->insert_null(); - } else { - bool matched = - JoinOpType == TJoinOp::LEFT_SEMI_JOIN ? build_idx != 0 : build_idx == 0; - if (!matched && _has_null_key) { - mark_column->insert_null(); - } else { - mark_column->insert_value(matched); - } - } - - probe_idxs[matched_cnt] = probe_idx++; - build_idxs[matched_cnt] = build_idx; - matched_cnt++; - } - return std::tuple {probe_idx, 0U, matched_cnt}; - } - template auto _process_null_aware_left_anti_join_for_empty_build_side( int probe_idx, int probe_rows, uint32_t* __restrict probe_idxs, @@ -203,14 +236,13 @@ private: while (probe_idx < probe_rows && matched_cnt < batch_size) { probe_idxs[matched_cnt] = probe_idx++; - if constexpr (is_mark_join) { - build_idxs[matched_cnt] = 0; - } + build_idxs[matched_cnt] = 0; ++matched_cnt; } if constexpr (is_mark_join && !with_other_conjuncts) { - mark_column->resize_fill(matched_cnt, 1); + // we will flip the mark column later for anti join, so here set 0 into mark column. 
+ mark_column->resize_fill(matched_cnt, 0); } return std::tuple {probe_idx, 0U, matched_cnt}; @@ -260,7 +292,7 @@ private: return std::tuple {probe_idx, 0U, matched_cnt}; } - template + template auto _find_batch_conjunct(const Key* __restrict keys, const uint32_t* __restrict build_idx_map, int probe_idx, uint32_t build_idx, int probe_rows, uint32_t* __restrict probe_idxs, uint32_t* __restrict build_idxs) { @@ -276,7 +308,17 @@ private: build_idxs[matched_cnt] = build_idx; matched_cnt++; } - } else if (keys[probe_idx] == build_keys[build_idx]) { + } else if constexpr (need_judge_null) { + if (build_idx == bucket_size) { + build_idxs[matched_cnt] = build_idx; + probe_idxs[matched_cnt] = probe_idx; + build_idx = 0; + matched_cnt++; + break; + } + } + + if (keys[probe_idx] == build_keys[build_idx]) { build_idxs[matched_cnt] = build_idx; probe_idxs[matched_cnt] = probe_idx; matched_cnt++; @@ -288,7 +330,8 @@ private: JoinOpType == TJoinOp::FULL_OUTER_JOIN || JoinOpType == TJoinOp::LEFT_SEMI_JOIN || JoinOpType == TJoinOp::LEFT_ANTI_JOIN || - JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) { // may over batch_size when emplace 0 into build_idxs if (!build_idx) { probe_idxs[matched_cnt] = probe_idx; diff --git a/be/src/vec/exec/join/null_aware_left_semi_join_impl.cpp b/be/src/vec/exec/join/null_aware_left_semi_join_impl.cpp new file mode 100644 index 0000000000..98d39e6147 --- /dev/null +++ b/be/src/vec/exec/join/null_aware_left_semi_join_impl.cpp @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include "process_hash_table_probe_impl.h" + +namespace doris::vectorized { + +INSTANTIATION_FOR(TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN); + +} diff --git a/be/src/vec/exec/join/process_hash_table_probe.h b/be/src/vec/exec/join/process_hash_table_probe.h index 803cc34bd7..02bf242e55 100644 --- a/be/src/vec/exec/join/process_hash_table_probe.h +++ b/be/src/vec/exec/join/process_hash_table_probe.h @@ -44,7 +44,7 @@ struct ProcessHashTableProbe { // output build side result column void build_side_output_column(MutableColumns& mcol, const std::vector& output_slot_flags, - int size, bool have_other_join_conjunct); + int size, bool have_other_join_conjunct, bool is_mark_join); void probe_side_output_column(MutableColumns& mcol, const std::vector& output_slot_flags, int size, int last_probe_index, bool all_match_one, @@ -70,6 +70,10 @@ struct ProcessHashTableProbe { Status do_other_join_conjuncts(Block* output_block, bool is_mark_join, std::vector& visited, bool has_null_in_build_side); + template + Status do_mark_join_conjuncts(Block* output_block, size_t hash_table_bucket_size, + const std::set& null_result); + template typename HashTableType::State _init_probe_side(HashTableType& hash_table_ctx, size_t probe_rows, bool with_other_join_conjuncts, diff --git a/be/src/vec/exec/join/process_hash_table_probe_impl.h b/be/src/vec/exec/join/process_hash_table_probe_impl.h index c9f92c59e0..fd2ac01613 100644 --- a/be/src/vec/exec/join/process_hash_table_probe_impl.h +++ b/be/src/vec/exec/join/process_hash_table_probe_impl.h @@ -62,7 +62,7 @@ 
ProcessHashTableProbe::ProcessHashTableProbe(Parent* parent, template void ProcessHashTableProbe::build_side_output_column( MutableColumns& mcol, const std::vector& output_slot_flags, int size, - bool have_other_join_conjunct) { + bool have_other_join_conjunct, bool is_mark_join) { SCOPED_TIMER(_build_side_output_timer); constexpr auto is_semi_anti_join = JoinOpType == TJoinOp::RIGHT_ANTI_JOIN || JoinOpType == TJoinOp::RIGHT_SEMI_JOIN || @@ -73,7 +73,9 @@ void ProcessHashTableProbe::build_side_output_column( constexpr auto probe_all = JoinOpType == TJoinOp::LEFT_OUTER_JOIN || JoinOpType == TJoinOp::FULL_OUTER_JOIN; - if ((!is_semi_anti_join || have_other_join_conjunct) && size) { + if ((!is_semi_anti_join || have_other_join_conjunct || + (is_mark_join && !_parent->_mark_join_conjuncts.empty())) && + size) { for (int i = 0; i < _right_col_len; i++) { const auto& column = *_build_block->safe_get_by_position(i).column; if (output_slot_flags[i]) { @@ -177,7 +179,21 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash mark_column = std::make_unique(*mcol[mcol.size() - 1]); } - { + /// `null_result` set which contains the probe indexes of null results. 
+ std::set null_result; + if constexpr ((JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == doris::TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) && + with_other_conjuncts) { + SCOPED_TIMER(_search_hashtable_timer); + auto [new_probe_idx, new_build_idx, new_current_offset] = + hash_table_ctx.hash_table->find_null_aware_with_other_conjuncts( + hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(), probe_index, + build_index, probe_rows, _probe_indexs.data(), _build_indexs.data(), + null_result, *(_parent->_build_indexes_null), _build_block->rows()); + probe_index = new_probe_idx; + build_index = new_build_idx; + current_offset = new_current_offset; + } else { SCOPED_TIMER(_search_hashtable_timer); auto [new_probe_idx, new_build_idx, new_current_offset] = hash_table_ctx.hash_table->template find_batch < JoinOpType, @@ -191,7 +207,8 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash current_offset = new_current_offset; } - build_side_output_column(mcol, *_right_output_slot_flags, current_offset, with_other_conjuncts); + build_side_output_column(mcol, *_right_output_slot_flags, current_offset, with_other_conjuncts, + is_mark_join); if constexpr (with_other_conjuncts || (JoinOpType != TJoinOp::RIGHT_SEMI_JOIN && JoinOpType != TJoinOp::RIGHT_ANTI_JOIN)) { @@ -214,7 +231,10 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash output_block->swap(mutable_block.to_block()); - if constexpr (with_other_conjuncts) { + if constexpr (is_mark_join) { + return do_mark_join_conjuncts( + output_block, hash_table_ctx.hash_table->get_bucket_size(), null_result); + } else if constexpr (with_other_conjuncts) { return do_other_join_conjuncts(output_block, is_mark_join, hash_table_ctx.hash_table->get_visited(), hash_table_ctx.hash_table->has_null_key()); @@ -223,6 +243,131 @@ Status ProcessHashTableProbe::do_process(HashTableType& hash return Status::OK(); } +template +template +Status ProcessHashTableProbe::do_mark_join_conjuncts( + Block* 
output_block, size_t hash_table_bucket_size, const std::set& null_result) { + DCHECK(JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::LEFT_SEMI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN); + + constexpr bool is_anti_join = JoinOpType == TJoinOp::LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; + constexpr bool is_null_aware_join = JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; + + const auto row_count = output_block->rows(); + auto mark_column_mutable = + output_block->get_by_position(_parent->_mark_column_id).column->assume_mutable(); + auto& mark_column = assert_cast(*mark_column_mutable); + IColumn::Filter& filter = assert_cast(mark_column.get_nested_column()).get_data(); + + if (_parent->_mark_join_conjuncts.empty()) { + // For null aware anti/semi join, if the equal conjuncts was not matched and the build side has null value, + // the result should be null. 
Like: + // select 4 not in (2, 3, null) => null, select 4 not in (2, 3) => true + // select 4 in (2, 3, null) => null, select 4 in (2, 3) => false + const bool should_be_null_if_build_side_has_null = *_has_null_in_build_side; + + mark_column.resize(row_count); + auto* filter_data = + assert_cast(mark_column.get_nested_column()).get_data().data(); + auto* mark_null_map = mark_column.get_null_map_data().data(); + int last_probe_matched = -1; + for (size_t i = 0; i != row_count; ++i) { + filter_data[i] = _build_indexs[i] != 0 && _build_indexs[i] != hash_table_bucket_size; + if constexpr (is_null_aware_join) { + if constexpr (with_other_conjuncts) { + mark_null_map[i] = + null_result.contains(_probe_indexs[i]) && _build_indexs[i] != 0; + } else { + if (filter_data[i]) { + last_probe_matched = _probe_indexs[i]; + mark_null_map[i] = false; + } else if (_build_indexs[i] == 0) { + mark_null_map[i] = should_be_null_if_build_side_has_null && + last_probe_matched != _probe_indexs[i]; + } else if (_build_indexs[i] == hash_table_bucket_size) { + mark_null_map[i] = true; + } + } + } + } + if constexpr (!is_null_aware_join) { + memset(mark_null_map, 0, row_count); + } + } else { + RETURN_IF_ERROR(VExprContext::execute_conjuncts(_parent->_mark_join_conjuncts, output_block, + mark_column.get_null_map_column(), filter)); + } + auto* mark_null_map = mark_column.get_null_map_data().data(); + + auto* mark_filter_data = filter.data(); + + if constexpr (with_other_conjuncts) { + IColumn::Filter other_conjunct_filter(row_count, 1); + { + bool can_be_filter_all = false; + RETURN_IF_ERROR(VExprContext::execute_conjuncts(_parent->_other_join_conjuncts, nullptr, + output_block, &other_conjunct_filter, + &can_be_filter_all)); + } + DCHECK_EQ(filter.size(), other_conjunct_filter.size()); + const auto* other_filter_data = other_conjunct_filter.data(); + for (size_t i = 0; i != filter.size(); ++i) { + // null & any(true or false) => null => false + mark_filter_data[i] &= (!mark_null_map[i]) & 
other_filter_data[i]; + + // null & true => null + // null & false => false + mark_null_map[i] &= other_filter_data[i]; + } + } + + auto filter_column = ColumnUInt8::create(row_count, 0); + auto* __restrict filter_map = filter_column->get_data().data(); + + /** + * Here need `!with_other_conjuncts` be true, + * because null aware join with other join conjuncts will process the `mark_null_map` after the + * other join conjuncts are executed. + */ + const bool should_be_null_if_build_side_has_null = + *_has_null_in_build_side && is_null_aware_join && !with_other_conjuncts; + for (size_t i = 0; i != row_count; ++i) { + bool not_matched_before = _parent->_last_probe_match != _probe_indexs[i]; + if (_build_indexs[i] == 0) { + bool has_null_mark_value = _parent->_last_probe_null_mark == _probe_indexs[i]; + if (not_matched_before) { + filter_map[i] = true; + mark_null_map[i] = has_null_mark_value || should_be_null_if_build_side_has_null; + mark_filter_data[i] = false; + } + } else { + if (mark_null_map[i]) { // is null + _parent->_last_probe_null_mark = _probe_indexs[i]; + } else { + if (mark_filter_data[i] && not_matched_before) { + _parent->_last_probe_match = _probe_indexs[i]; + filter_map[i] = true; + } + } + } + } + + if constexpr (is_anti_join) { + // flip the mark column + for (size_t i = 0; i != row_count; ++i) { + mark_filter_data[i] ^= 1; + } + } + + auto result_column_id = output_block->columns(); + output_block->insert({std::move(filter_column), std::make_shared(), ""}); + return Block::filter_block(output_block, result_column_id, result_column_id); +} + template Status ProcessHashTableProbe::do_other_join_conjuncts( Block* output_block, bool is_mark_join, std::vector& visited, diff --git a/be/src/vec/exec/join/vhash_join_node.cpp b/be/src/vec/exec/join/vhash_join_node.cpp index c7b6b5d041..271e522cd6 100644 --- a/be/src/vec/exec/join/vhash_join_node.cpp +++ b/be/src/vec/exec/join/vhash_join_node.cpp @@ -107,7 +107,8 @@ Status HashJoinNode::init(const 
TPlanNode& tnode, RuntimeState* state) { _join_op == TJoinOp::RIGHT_ANTI_JOIN; const bool probe_dispose_null = _match_all_probe || _build_unique || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || - _join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN; + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || _join_op == TJoinOp::LEFT_ANTI_JOIN || + _join_op == TJoinOp::LEFT_SEMI_JOIN; const std::vector& eq_join_conjuncts = tnode.hash_join_node.eq_join_conjuncts; std::vector probe_not_ignore_null(eq_join_conjuncts.size()); @@ -156,6 +157,35 @@ Status HashJoinNode::init(const TPlanNode& tnode, RuntimeState* state) { DCHECK(_have_other_join_conjunct); } + if (tnode.hash_join_node.__isset.mark_join_conjuncts) { + RETURN_IF_ERROR(VExpr::create_expr_trees(tnode.hash_join_node.mark_join_conjuncts, + _mark_join_conjuncts)); + DCHECK(_is_mark_join); + + /// We make mark join conjuncts as equal conjuncts for null aware join, + /// so `_mark_join_conjuncts` should be empty if this is null aware join. 
+ DCHECK_EQ(_mark_join_conjuncts.empty(), + _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN); + } + +#ifndef NDEBUG + /// mark join should be half join + if (_is_mark_join) { + DCHECK(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || + _join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN || + _join_op == TJoinOp::RIGHT_ANTI_JOIN || _join_op == TJoinOp::RIGHT_SEMI_JOIN) + << "join(op: " << _join_op << ") should not be mark join"; + } +#endif + + if ((_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) && + _have_other_join_conjunct) { + _build_indexes_null = std::make_shared>(); + } + _runtime_filters.resize(_runtime_filter_descs.size()); for (size_t i = 0; i < _runtime_filter_descs.size(); i++) { RETURN_IF_ERROR(state->runtime_filter_mgr()->register_producer_filter( @@ -250,6 +280,11 @@ Status HashJoinNode::prepare(RuntimeState* state) { for (auto& conjunct : _other_join_conjuncts) { RETURN_IF_ERROR(conjunct->prepare(state, *_intermediate_row_desc)); } + + for (auto& conjunct : _mark_join_conjuncts) { + RETURN_IF_ERROR(conjunct->prepare(state, *_intermediate_row_desc)); + } + RETURN_IF_ERROR(VExpr::prepare(_output_expr_ctxs, state, *_intermediate_row_desc)); // right table data types @@ -300,6 +335,7 @@ void HashJoinNode::prepare_for_next() { _build_index = 0; _ready_probe = false; _last_probe_match = -1; + _last_probe_null_mark = -1; _prepare_probe_block(); } @@ -312,39 +348,6 @@ Status HashJoinNode::pull(doris::RuntimeState* state, vectorized::Block* output_ return Status::OK(); } - /// `_has_null_in_build_side` means have null value in build side. - /// `_short_circuit_for_null_in_build_side` means short circuit if has null in build side(e.g. null aware left anti join). 
- if (_has_null_in_build_side && _short_circuit_for_null_in_build_side && _is_mark_join) { - /// We need to create a column as mark with all rows set to NULL. - auto block_rows = _probe_block.rows(); - if (block_rows == 0) { - *eos = _probe_eos; - return Status::OK(); - } - - Block temp_block; - //get probe side output column - for (int i = 0; i < _left_output_slot_flags.size(); ++i) { - temp_block.insert(_probe_block.get_by_position(i)); - } - auto mark_column = ColumnNullable::create(ColumnUInt8::create(block_rows, 0), - ColumnUInt8::create(block_rows, 1)); - temp_block.insert( - {std::move(mark_column), make_nullable(std::make_shared()), ""}); - - { - SCOPED_TIMER(_join_filter_timer); - RETURN_IF_ERROR( - VExprContext::filter_block(_conjuncts, &temp_block, temp_block.columns())); - } - - RETURN_IF_ERROR(_build_output_block(&temp_block, output_block, false)); - temp_block.clear(); - release_block_memory(_probe_block); - reached_limit(output_block, eos); - return Status::OK(); - } - //TODO: this short circuit maybe could refactor, no need to check at here. if (_empty_right_table_need_probe_dispose) { // when build table rows is 0 and not have other_join_conjunct and join type is one of LEFT_OUTER_JOIN/FULL_OUTER_JOIN/LEFT_ANTI_JOIN @@ -667,6 +670,10 @@ Status HashJoinNode::alloc_resource(doris::RuntimeState* state) { for (auto& conjunct : _other_join_conjuncts) { RETURN_IF_ERROR(conjunct->open(state)); } + + for (auto& conjunct : _mark_join_conjuncts) { + RETURN_IF_ERROR(conjunct->open(state)); + } return Status::OK(); } @@ -754,6 +761,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc // arena will be shared with other instances. 
_shared_hash_table_context->arena = _arena; _shared_hash_table_context->block = _build_block; + _shared_hash_table_context->build_indexes_null = _build_indexes_null; _shared_hash_table_context->hash_table_variants = _hash_table_variants; _shared_hash_table_context->short_circuit_for_null_in_probe_side = _has_null_in_build_side; @@ -786,6 +794,7 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc *std::static_pointer_cast( _shared_hash_table_context->hash_table_variants)); _build_block = _shared_hash_table_context->block; + _build_indexes_null = _shared_hash_table_context->build_indexes_null; if (!_shared_hash_table_context->runtime_filters.empty()) { auto ret = std::visit( @@ -816,9 +825,10 @@ Status HashJoinNode::sink(doris::RuntimeState* state, vectorized::Block* in_bloc _process_hashtable_ctx_variants_init(state); } - // Since the comparison of null values is meaningless, null aware left anti join should not output null + // Since the comparison of null values is meaningless, null aware left anti/semi join should not output null // when the build side is not empty. 
- if (_build_block && _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + if (_build_block && (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN)) { _probe_ignore_null = true; } _init_short_circuit_for_probe(); @@ -917,11 +927,10 @@ bool HashJoinNode::_need_probe_null_map(Block& block, const std::vector& re void HashJoinNode::_set_build_ignore_flag(Block& block, const std::vector& res_col_ids) { DCHECK_EQ(_build_expr_ctxs.size(), _probe_expr_ctxs.size()); for (size_t i = 0; i < _build_expr_ctxs.size(); ++i) { - if (!_is_null_safe_eq_join[i]) { + if (!_is_null_safe_eq_join[i] && !_short_circuit_for_null_in_build_side) { const auto* column = block.get_by_position(res_col_ids[i]).column.get(); if (check_and_get_column(*column)) { - _build_side_ignore_null |= (_join_op != TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && - !_store_null_in_hash_table[i]); + _build_side_ignore_null |= !_store_null_in_hash_table[i]; } } } @@ -962,30 +971,33 @@ Status HashJoinNode::_process_build_block(RuntimeState* state, Block& block) { st = std::visit( Overload {[&](std::monostate& arg, auto join_op, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { + auto short_circuit_for_null_in_build_side, + auto with_other_conjuncts) -> Status { LOG(FATAL) << "FATAL: uninited hash table"; __builtin_unreachable(); return Status::OK(); }, [&](auto&& arg, auto&& join_op, auto has_null_value, - auto short_circuit_for_null_in_build_side) -> Status { + auto short_circuit_for_null_in_build_side, + auto with_other_conjuncts) -> Status { using HashTableCtxType = std::decay_t; using JoinOpType = std::decay_t; ProcessHashTableBuild hash_table_build_process(rows, block, raw_ptrs, this, state->batch_size(), state); - return hash_table_build_process - .template run( - arg, - has_null_value || short_circuit_for_null_in_build_side - ? 
&null_map_val->get_data() - : nullptr, - &_has_null_in_build_side); + return hash_table_build_process.template run< + JoinOpType::value, has_null_value, + short_circuit_for_null_in_build_side, with_other_conjuncts>( + arg, + has_null_value || short_circuit_for_null_in_build_side + ? &null_map_val->get_data() + : nullptr, + &_has_null_in_build_side); }}, *_hash_table_variants, _join_op_variants, make_bool_variant(_build_side_ignore_null), - make_bool_variant(_short_circuit_for_null_in_build_side)); + make_bool_variant(_short_circuit_for_null_in_build_side), + make_bool_variant(_have_other_join_conjunct)); return st; } diff --git a/be/src/vec/exec/join/vhash_join_node.h b/be/src/vec/exec/join/vhash_join_node.h index 7fdb103d1f..535db4434d 100644 --- a/be/src/vec/exec/join/vhash_join_node.h +++ b/be/src/vec/exec/join/vhash_join_node.h @@ -110,13 +110,19 @@ struct ProcessHashTableBuild { _batch_size(batch_size), _state(state) {} - template + template Status run(HashTableContext& hash_table_ctx, ConstNullMapPtr null_map, bool* has_null_key) { if (short_circuit_for_null || ignore_null) { // first row is mocked and is null for (uint32_t i = 1; i < _rows; i++) { if ((*null_map)[i]) { *has_null_key = true; + if constexpr (with_other_conjuncts && + (JoinOpType == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + JoinOpType == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN)) { + _parent->_build_indexes_null->emplace_back(i); + } } } if (short_circuit_for_null && *has_null_key) { @@ -201,7 +207,8 @@ using HashTableCtxVariants = ProcessHashTableProbe, ProcessHashTableProbe, ProcessHashTableProbe, - ProcessHashTableProbe>; + ProcessHashTableProbe, + ProcessHashTableProbe>; class HashJoinNode final : public VJoinNodeBase { public: @@ -291,6 +298,9 @@ private: // other expr VExprContextSPtrs _other_join_conjuncts; + // conjuncts for mark join, which result type is ternary boolean(true, false, null) + VExprContextSPtrs _mark_join_conjuncts; + // mark the join column whether support null eq std::vector 
_is_null_safe_eq_join; @@ -304,6 +314,13 @@ private: std::vector _probe_column_disguise_null; std::vector _probe_column_convert_to_null; + /* + * For null aware anti/semi join with other join conjuncts, we do need to care about the rows in + * build side with null keys, + * because the other join conjuncts' result maybe change null to false(null & false == false). + */ + std::shared_ptr> _build_indexes_null; + DataTypes _right_table_data_types; DataTypes _left_table_data_types; std::vector _right_table_column_names; @@ -352,6 +369,9 @@ private: bool _probe_eos = false; int _last_probe_match; + // For mark join, last probe index of null mark + int _last_probe_null_mark; + bool _build_side_ignore_null = false; bool _is_broadcast_join = false; diff --git a/be/src/vec/exec/join/vjoin_node_base.cpp b/be/src/vec/exec/join/vjoin_node_base.cpp index 656810ba7b..e81851b7d9 100644 --- a/be/src/vec/exec/join/vjoin_node_base.cpp +++ b/be/src/vec/exec/join/vjoin_node_base.cpp @@ -63,29 +63,33 @@ VJoinNodeBase::VJoinNodeBase(ObjectPool* pool, const TPlanNode& tnode, const Des _join_op == TJoinOp::FULL_OUTER_JOIN), _build_unique(!_have_other_join_conjunct && (_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN || _join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN)), _is_right_semi_anti(_join_op == TJoinOp::RIGHT_ANTI_JOIN || _join_op == TJoinOp::RIGHT_SEMI_JOIN), _is_left_semi_anti(_join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN || - _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN), + _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN), _is_outer_join(_match_all_build || _match_all_probe), _is_mark_join(tnode.__isset.nested_loop_join_node - ? (tnode.nested_loop_join_node.__isset.is_mark - ? tnode.nested_loop_join_node.is_mark - : false) - : tnode.hash_join_node.__isset.is_mark ? 
tnode.hash_join_node.is_mark - : false), - _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) { + ? tnode.nested_loop_join_node.__isset.is_mark && + tnode.nested_loop_join_node.is_mark + : tnode.hash_join_node.__isset.is_mark && + tnode.hash_join_node.is_mark), + _short_circuit_for_null_in_build_side(_join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN && + !_is_mark_join) { _init_join_op(); if (_is_mark_join) { DCHECK(_join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN || - _join_op == TJoinOp::CROSS_JOIN || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN) + _join_op == TJoinOp::CROSS_JOIN || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN || + _join_op == TJoinOp::NULL_AWARE_LEFT_SEMI_JOIN) << "Mark join is only supported for null aware left semi/anti join and cross join " "but this is " << _join_op; } + if (tnode.__isset.hash_join_node) { _output_row_desc.reset( new RowDescriptor(descs, {tnode.hash_join_node.voutput_tuple_id}, {false})); @@ -144,8 +148,15 @@ void VJoinNodeBase::_construct_mutable_join_block() { } } - DCHECK(!_is_mark_join || - _join_block.get_by_position(_join_block.columns() - 1).column->is_nullable()); + if (_is_mark_join) { + _mark_column_id = _join_block.columns() - 1; +#ifndef NDEBUG + const auto& mark_column = assert_cast( + *_join_block.get_by_position(_mark_column_id).column); + auto& nested_column = mark_column.get_nested_column(); + DCHECK(check_and_get_column(nested_column) != nullptr); +#endif + } } Status VJoinNodeBase::_build_output_block(Block* origin_block, Block* output_block, @@ -290,7 +301,8 @@ void VJoinNodeBase::_probe_side_open_thread(RuntimeState* state, std::promise, std::integral_constant, std::integral_constant, - std::integral_constant>; + std::integral_constant, + std::integral_constant>; class VJoinNodeBase : public ExecNode { public: @@ -121,6 +122,9 @@ protected: // for some join, when build side rows is empty, we could return directly by add some additional null 
data in probe table. bool _empty_right_table_need_probe_dispose = false; + + size_t _mark_column_id; + std::unique_ptr _output_row_desc; std::unique_ptr _intermediate_row_desc; // output expr diff --git a/be/src/vec/exprs/vexpr_context.cpp b/be/src/vec/exprs/vexpr_context.cpp index cebb7dd2e5..7325ba5f31 100644 --- a/be/src/vec/exprs/vexpr_context.cpp +++ b/be/src/vec/exprs/vexpr_context.cpp @@ -223,6 +223,50 @@ Status VExprContext::execute_conjuncts(const VExprContextSPtrs& ctxs, return Status::OK(); } +Status VExprContext::execute_conjuncts(const VExprContextSPtrs& conjuncts, Block* block, + ColumnUInt8& null_map, IColumn::Filter& filter) { + const auto& rows = block->rows(); + if (rows == 0) { + return Status::OK(); + } + + null_map.resize(rows); + auto* final_null_map = null_map.get_data().data(); + memset(final_null_map, 0, rows); + filter.resize_fill(rows, 1); + auto* final_filter_ptr = filter.data(); + + for (const auto& conjunct : conjuncts) { + int result_column_id = -1; + RETURN_IF_ERROR(conjunct->execute(block, &result_column_id)); + auto& filter_column = + unpack_if_const(block->get_by_position(result_column_id).column).first; + if (auto* nullable_column = check_and_get_column(*filter_column)) { + const ColumnPtr& nested_column = nullable_column->get_nested_column_ptr(); + const IColumn::Filter& result = + assert_cast(*nested_column).get_data(); + auto* __restrict filter_data = result.data(); + auto* __restrict null_map_data = nullable_column->get_null_map_data().data(); + DCHECK_EQ(rows, nullable_column->size()); + + for (size_t i = 0; i != rows; ++i) { + // null and null => null + // null and true => null + // null and false => false + final_null_map[i] = (final_null_map[i] & (null_map_data[i] | filter_data[i])) | + (null_map_data[i] & (final_null_map[i] | final_filter_ptr[i])); + final_filter_ptr[i] = final_filter_ptr[i] & filter_data[i]; + } + } else { + auto* filter_data = assert_cast(*filter_column).get_data().data(); + for (size_t i = 0; i != 
rows; ++i) { + final_filter_ptr[i] = final_filter_ptr[i] & filter_data[i]; + } + } + } + return Status::OK(); +} + // TODO Performance Optimization // need exception safety Status VExprContext::execute_conjuncts_and_filter_block( diff --git a/be/src/vec/exprs/vexpr_context.h b/be/src/vec/exprs/vexpr_context.h index 70bd37b187..423e1aac12 100644 --- a/be/src/vec/exprs/vexpr_context.h +++ b/be/src/vec/exprs/vexpr_context.h @@ -81,6 +81,10 @@ public: IColumn::Filter* result_filter, bool* can_filter_all); + [[nodiscard]] static Status execute_conjuncts(const VExprContextSPtrs& conjuncts, Block* block, + ColumnUInt8& null_map, + IColumn::Filter& result_filter); + static Status execute_conjuncts(const VExprContextSPtrs& ctxs, const std::vector* filters, Block* block, IColumn::Filter* result_filter, bool* can_filter_all); diff --git a/be/src/vec/runtime/shared_hash_table_controller.h b/be/src/vec/runtime/shared_hash_table_controller.h index b8770e6385..a81fc994f3 100644 --- a/be/src/vec/runtime/shared_hash_table_controller.h +++ b/be/src/vec/runtime/shared_hash_table_controller.h @@ -59,6 +59,7 @@ struct SharedHashTableContext { std::shared_ptr arena; std::shared_ptr hash_table_variants; std::shared_ptr block; + std::shared_ptr> build_indexes_null; std::map runtime_filters; std::atomic signaled = false; bool short_circuit_for_null_in_probe_side = false; diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/JoinOperator.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/JoinOperator.java index a45835f3bd..bd8798bd4d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/JoinOperator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/JoinOperator.java @@ -37,7 +37,9 @@ public enum JoinOperator { // NOT IN subqueries. It can have a single equality join conjunct // that returns TRUE when the rhs is NULL. 
NULL_AWARE_LEFT_ANTI_JOIN("NULL AWARE LEFT ANTI JOIN", - TJoinOp.NULL_AWARE_LEFT_ANTI_JOIN); + TJoinOp.NULL_AWARE_LEFT_ANTI_JOIN), + NULL_AWARE_LEFT_SEMI_JOIN("NULL AWARE LEFT SEMI JOIN", + TJoinOp.NULL_AWARE_LEFT_SEMI_JOIN); private final String description; private final TJoinOp thriftJoinOp; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 5acda0f298..cc8b2483ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -1183,9 +1183,22 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor JoinUtils.swapEqualToForChildrenOrder(e, hashJoin.left().getOutputSet())) .map(e -> ExpressionTranslator.translate(e, context)) .collect(Collectors.toList()); + List markConjuncts = ImmutableList.of(); + boolean isHashJoinConjunctsEmpty = hashJoin.getHashJoinConjuncts().isEmpty(); + boolean isMarkJoinConjunctsEmpty = hashJoin.getMarkJoinConjuncts().isEmpty(); + if (isHashJoinConjunctsEmpty) { + // if hash join conjuncts is empty, means mark join conjuncts must be EqualPredicate + // BE should use mark join conjuncts to build hash table + Preconditions.checkState(!isMarkJoinConjunctsEmpty, "mark join conjuncts should not be empty."); + markConjuncts = hashJoin.getMarkJoinConjuncts().stream() + .map(EqualPredicate.class::cast) + .map(e -> JoinUtils.swapEqualToForChildrenOrder(e, hashJoin.left().getOutputSet())) + .map(e -> ExpressionTranslator.translate(e, context)) + .collect(Collectors.toList()); + } HashJoinNode hashJoinNode = new HashJoinNode(context.nextPlanNodeId(), leftPlanRoot, - rightPlanRoot, JoinType.toJoinOperator(joinType), execEqConjuncts, Lists.newArrayList(), + rightPlanRoot, JoinType.toJoinOperator(joinType), execEqConjuncts, 
Lists.newArrayList(), markConjuncts, null, null, null, hashJoin.isMarkJoin()); hashJoinNode.setNereidsId(hashJoin.getId()); hashJoinNode.setDistributeExprLists(distributeExprLists); @@ -1246,6 +1259,15 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor e.getInputSlots().stream()) .map(SlotReference.class::cast) .forEach(s -> hashOutputSlotReferenceMap.put(s.getExprId(), s)); + if (!isHashJoinConjunctsEmpty && !isMarkJoinConjunctsEmpty) { + // if hash join conjuncts is NOT empty, mark join conjuncts would be processed like other conjuncts + // BE should deal with mark join conjuncts differently, its result is 3 value bool(true, false, null) + hashJoin.getMarkJoinConjuncts() + .stream() + .flatMap(e -> e.getInputSlots().stream()) + .map(SlotReference.class::cast) + .forEach(s -> hashOutputSlotReferenceMap.put(s.getExprId(), s)); + } hashJoin.getFilterConjuncts().stream() .filter(e -> !(e.equals(BooleanLiteral.TRUE))) .flatMap(e -> e.getInputSlots().stream()) @@ -1271,7 +1293,7 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor rightIntermediateSlotDescriptor = Lists.newArrayList(); TupleDescriptor intermediateDescriptor = context.generateTupleDesc(); - if (hashJoin.getOtherJoinConjuncts().isEmpty() + if (hashJoin.getOtherJoinConjuncts().isEmpty() && (isHashJoinConjunctsEmpty != isMarkJoinConjunctsEmpty) && (joinType == JoinType.LEFT_ANTI_JOIN || joinType == JoinType.LEFT_SEMI_JOIN || joinType == JoinType.NULL_AWARE_LEFT_ANTI_JOIN)) { @@ -1294,7 +1316,7 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor markJoinConjuncts = hashJoin.getMarkJoinConjuncts() + .stream() + .map(e -> ExpressionTranslator.translate(e, context)) + .collect(Collectors.toList()); + hashJoinNode.setMarkJoinConjuncts(markJoinConjuncts); + } + hashJoinNode.setvIntermediateTupleDescList(Lists.newArrayList(intermediateDescriptor)); if (hashJoin.isShouldTranslateOutput()) { @@ -1564,6 +1595,12 @@ public class PhysicalPlanTranslator extends 
DefaultPlanVisitor markJoinConjuncts = nestedLoopJoin.getMarkJoinConjuncts().stream() + .map(e -> ExpressionTranslator.translate(e, context)).collect(Collectors.toList()); + nestedLoopJoinNode.setMarkJoinConjuncts(markJoinConjuncts); + } + nestedLoopJoin.getFilterConjuncts().stream() .filter(e -> !(e.equals(BooleanLiteral.TRUE))) .map(e -> ExpressionTranslator.translate(e, context)) @@ -1713,6 +1750,13 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor markConjuncts = ((HashJoinNode) joinNode).getMarkJoinConjuncts(); + for (Expr expr : markConjuncts) { + Expr.extractSlots(expr, requiredOtherConjunctsSlotIdSet); + } + } requiredOtherConjunctsSlotIdSet.forEach(e -> requiredExprIds.add(context.findExprId(e))); requiredSlotIdSet.forEach(e -> requiredExprIds.add(context.findExprId(e))); for (ExprId exprId : requiredExprIds) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java index 5f4c784d7d..6a994c1b6e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/RuleType.java @@ -211,6 +211,7 @@ public enum RuleType { ELIMINATE_NOT_NULL(RuleTypeClass.REWRITE), ELIMINATE_UNNECESSARY_PROJECT(RuleTypeClass.REWRITE), ELIMINATE_OUTER_JOIN(RuleTypeClass.REWRITE), + ELIMINATE_MARK_JOIN(RuleTypeClass.REWRITE), ELIMINATE_GROUP_BY(RuleTypeClass.REWRITE), ELIMINATE_JOIN_BY_UK(RuleTypeClass.REWRITE), ELIMINATE_DEDUP_JOIN_CONDITION(RuleTypeClass.REWRITE), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CollectJoinConstraint.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CollectJoinConstraint.java index 42bc6d1f9b..459118435c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CollectJoinConstraint.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/CollectJoinConstraint.java @@ 
-47,7 +47,7 @@ public class CollectJoinConstraint implements RewriteRuleFactory { @Override public List buildRules() { return ImmutableList.of( - logicalJoin().thenApply(ctx -> { + logicalJoin().whenNot(LogicalJoin::isMarkJoin).thenApply(ctx -> { if (!ctx.cascadesContext.isLeadingJoin()) { return ctx.root; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/SubqueryToApply.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/SubqueryToApply.java index 12eed92b40..b2f202aad8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/SubqueryToApply.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/SubqueryToApply.java @@ -21,6 +21,7 @@ import org.apache.doris.nereids.CascadesContext; import org.apache.doris.nereids.StatementContext; import org.apache.doris.nereids.rules.Rule; import org.apache.doris.nereids.rules.RuleType; +import org.apache.doris.nereids.rules.expression.rules.TrySimplifyPredicateWithMarkJoinSlot; import org.apache.doris.nereids.trees.TreeNode; import org.apache.doris.nereids.trees.expressions.Alias; import org.apache.doris.nereids.trees.expressions.And; @@ -49,6 +50,7 @@ import org.apache.doris.nereids.trees.plans.logical.LogicalOneRowRelation; import org.apache.doris.nereids.trees.plans.logical.LogicalPlan; import org.apache.doris.nereids.trees.plans.logical.LogicalProject; import org.apache.doris.nereids.trees.plans.logical.LogicalSort; +import org.apache.doris.nereids.util.ExpressionUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; @@ -75,8 +77,6 @@ public class SubqueryToApply implements AnalysisRuleFactory { RuleType.FILTER_SUBQUERY_TO_APPLY.build( logicalFilter().thenApply(ctx -> { LogicalFilter filter = ctx.root; - boolean shouldOutputMarkJoinSlot = filter.getConjuncts().stream() - .anyMatch(expr -> shouldOutputMarkJoinSlot(expr, SearchState.SearchNot)); ImmutableList> subqueryExprsList = 
filter.getConjuncts().stream() .>map(e -> e.collect(SubqueryToApply::canConvertToSupply)) .collect(ImmutableList.toImmutableList()); @@ -84,6 +84,11 @@ public class SubqueryToApply implements AnalysisRuleFactory { .flatMap(Collection::stream).noneMatch(SubqueryExpr.class::isInstance)) { return filter; } + ImmutableList shouldOutputMarkJoinSlot = + filter.getConjuncts().stream() + .map(expr -> !(expr instanceof SubqueryExpr) + && expr.containsType(SubqueryExpr.class)) + .collect(ImmutableList.toImmutableList()); List oldConjuncts = ImmutableList.copyOf(filter.getConjuncts()); ImmutableList.Builder newConjuncts = new ImmutableList.Builder<>(); @@ -101,15 +106,29 @@ public class SubqueryToApply implements AnalysisRuleFactory { // first step: Replace the subquery of predicate in LogicalFilter // second step: Replace subquery with LogicalApply ReplaceSubquery replaceSubquery = new ReplaceSubquery( - ctx.statementContext, shouldOutputMarkJoinSlot); + ctx.statementContext, shouldOutputMarkJoinSlot.get(i)); SubqueryContext context = new SubqueryContext(subqueryExprs); Expression conjunct = replaceSubquery.replace(oldConjuncts.get(i), context); + /* + * the idea is replacing each mark join slot with null and false literal + * then run FoldConstant rule, if the evaluate result are: + * 1. all true + * 2. all null and false (in logicalFilter, we discard both null and false values) + * the mark slot can be non-nullable boolean + * we pass this info to LogicalApply. And in InApplyToJoin rule + * if it's semi join with non-null mark slot + * we can safely change the mark conjunct to hash conjunct + */ + boolean isMarkSlotNotNull = conjunct.containsType(MarkJoinSlotReference.class) + ? 
ExpressionUtils.canInferNotNullForMarkSlot( + TrySimplifyPredicateWithMarkJoinSlot.INSTANCE.rewrite(conjunct, null)) + : false; applyPlan = subqueryToApply(subqueryExprs.stream() .collect(ImmutableList.toImmutableList()), tmpPlan, context.getSubqueryToMarkJoinSlot(), ctx.cascadesContext, - Optional.of(conjunct), false); + Optional.of(conjunct), false, isMarkSlotNotNull); tmpPlan = applyPlan; newConjuncts.add(conjunct); } @@ -120,8 +139,9 @@ public class SubqueryToApply implements AnalysisRuleFactory { return new LogicalProject<>(applyPlan.getOutput().stream() .filter(s -> !(s instanceof MarkJoinSlotReference)) .collect(ImmutableList.toImmutableList()), newFilter); + } else { + return newFilter; } - return new LogicalFilter<>(conjuncts, applyPlan); }) ), RuleType.PROJECT_SUBQUERY_TO_APPLY.build(logicalProject().thenApply(ctx -> { @@ -155,7 +175,7 @@ public class SubqueryToApply implements AnalysisRuleFactory { subqueryExprs.stream().collect(ImmutableList.toImmutableList()), childPlan, context.getSubqueryToMarkJoinSlot(), ctx.cascadesContext, - Optional.of(newProject), true); + Optional.of(newProject), true, false); childPlan = applyPlan; newProjects.add((NamedExpression) newProject); } @@ -216,12 +236,25 @@ public class SubqueryToApply implements AnalysisRuleFactory { ReplaceSubquery replaceSubquery = new ReplaceSubquery(ctx.statementContext, true); SubqueryContext context = new SubqueryContext(subqueryExprs); Expression conjunct = replaceSubquery.replace(subqueryConjuncts.get(i), context); - + /* + * the idea is replacing each mark join slot with null and false literal + * then run FoldConstant rule, if the evaluate result are: + * 1. all true + * 2. all null and false (in logicalFilter, we discard both null and false values) + * the mark slot can be non-nullable boolean + * we pass this info to LogicalApply. 
And in InApplyToJoin rule + * if it's semi join with non-null mark slot + * we can safely change the mark conjunct to hash conjunct + */ + boolean isMarkSlotNotNull = conjunct.containsType(MarkJoinSlotReference.class) + ? ExpressionUtils.canInferNotNullForMarkSlot( + TrySimplifyPredicateWithMarkJoinSlot.INSTANCE.rewrite(conjunct, null)) + : false; applyPlan = subqueryToApply( subqueryExprs.stream().collect(ImmutableList.toImmutableList()), relatedInfoList.get(i) == RelatedInfo.RelatedToLeft ? leftChildPlan : rightChildPlan, context.getSubqueryToMarkJoinSlot(), - ctx.cascadesContext, Optional.of(conjunct), false); + ctx.cascadesContext, Optional.of(conjunct), false, isMarkSlotNotNull); if (relatedInfoList.get(i) == RelatedInfo.RelatedToLeft) { leftChildPlan = applyPlan; } else { @@ -282,7 +315,7 @@ public class SubqueryToApply implements AnalysisRuleFactory { SubqueryExpr subqueryExpr = subqueryExprs.get(0); List correlatedSlots = subqueryExpr.getCorrelateSlots(); if (subqueryExpr instanceof ScalarSubquery) { - Set inputSlots = expression.getInputSlots(); + Set inputSlots = subqueryExpr.getInputSlots(); if (correlatedSlots.isEmpty() && inputSlots.isEmpty()) { relatedInfo = RelatedInfo.Unrelated; } else if (leftOutputSlots.containsAll(inputSlots) @@ -322,7 +355,8 @@ public class SubqueryToApply implements AnalysisRuleFactory { private LogicalPlan subqueryToApply(List subqueryExprs, LogicalPlan childPlan, Map> subqueryToMarkJoinSlot, CascadesContext ctx, - Optional conjunct, boolean isProject) { + Optional conjunct, boolean isProject, + boolean isMarkJoinSlotNotNull) { LogicalPlan tmpPlan = childPlan; for (int i = 0; i < subqueryExprs.size(); ++i) { SubqueryExpr subqueryExpr = subqueryExprs.get(i); @@ -336,7 +370,7 @@ public class SubqueryToApply implements AnalysisRuleFactory { if (!ctx.subqueryIsAnalyzed(subqueryExpr)) { tmpPlan = addApply(subqueryExpr, tmpPlan, subqueryToMarkJoinSlot, ctx, conjunct, - isProject, subqueryExprs.size() == 1); + isProject, 
subqueryExprs.size() == 1, isMarkJoinSlotNotNull); } } return tmpPlan; @@ -354,7 +388,7 @@ public class SubqueryToApply implements AnalysisRuleFactory { private LogicalPlan addApply(SubqueryExpr subquery, LogicalPlan childPlan, Map> subqueryToMarkJoinSlot, CascadesContext ctx, Optional conjunct, - boolean isProject, boolean singleSubquery) { + boolean isProject, boolean singleSubquery, boolean isMarkJoinSlotNotNull) { ctx.setSubqueryExprIsAnalyzed(subquery, true); boolean needAddScalarSubqueryOutputToProjects = isConjunctContainsScalarSubqueryOutput( subquery, conjunct, isProject, singleSubquery); @@ -362,7 +396,7 @@ public class SubqueryToApply implements AnalysisRuleFactory { subquery.getCorrelateSlots(), subquery, Optional.empty(), subqueryToMarkJoinSlot.get(subquery), - needAddScalarSubqueryOutputToProjects, isProject, + needAddScalarSubqueryOutputToProjects, isProject, isMarkJoinSlotNotNull, childPlan, subquery.getQueryPlan()); List projects = ImmutableList.builder() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/ExpressionRewrite.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/ExpressionRewrite.java index ebe27c0cfe..116cf426a4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/ExpressionRewrite.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/ExpressionRewrite.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.rules.expression; +import org.apache.doris.common.Pair; import org.apache.doris.nereids.properties.OrderKey; import org.apache.doris.nereids.rules.Rule; import org.apache.doris.nereids.rules.RuleType; @@ -39,7 +40,6 @@ import org.apache.doris.nereids.util.ExpressionUtils; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Lists; import java.util.ArrayList; import java.util.List; @@ -179,34 +179,39 @@ public class ExpressionRewrite implements 
RewriteRuleFactory { LogicalJoin join = ctx.root; List hashJoinConjuncts = join.getHashJoinConjuncts(); List otherJoinConjuncts = join.getOtherJoinConjuncts(); - if (otherJoinConjuncts.isEmpty() && hashJoinConjuncts.isEmpty()) { + List markJoinConjuncts = join.getMarkJoinConjuncts(); + if (otherJoinConjuncts.isEmpty() && hashJoinConjuncts.isEmpty() + && markJoinConjuncts.isEmpty()) { return join; } + ExpressionRewriteContext context = new ExpressionRewriteContext(ctx.cascadesContext); - List rewriteHashJoinConjuncts = Lists.newArrayList(); - boolean hashJoinConjunctsChanged = false; - for (Expression expr : hashJoinConjuncts) { - Expression newExpr = rewriter.rewrite(expr, context); - hashJoinConjunctsChanged = hashJoinConjunctsChanged || !newExpr.equals(expr); - rewriteHashJoinConjuncts.addAll(ExpressionUtils.extractConjunction(newExpr)); - } + Pair> newHashJoinConjuncts = rewriteConjuncts(hashJoinConjuncts, context); + Pair> newOtherJoinConjuncts = rewriteConjuncts(otherJoinConjuncts, context); + Pair> newMarkJoinConjuncts = rewriteConjuncts(markJoinConjuncts, context); - List rewriteOtherJoinConjuncts = Lists.newArrayList(); - boolean otherJoinConjunctsChanged = false; - for (Expression expr : otherJoinConjuncts) { - Expression newExpr = rewriter.rewrite(expr, context); - otherJoinConjunctsChanged = otherJoinConjunctsChanged || !newExpr.equals(expr); - rewriteOtherJoinConjuncts.addAll(ExpressionUtils.extractConjunction(newExpr)); - } - - if (!hashJoinConjunctsChanged && !otherJoinConjunctsChanged) { + if (!newHashJoinConjuncts.first && !newOtherJoinConjuncts.first + && !newMarkJoinConjuncts.first) { return join; } - return new LogicalJoin<>(join.getJoinType(), rewriteHashJoinConjuncts, - rewriteOtherJoinConjuncts, join.getDistributeHint(), join.getMarkJoinSlotReference(), - join.children()); + + return new LogicalJoin<>(join.getJoinType(), newHashJoinConjuncts.second, + newOtherJoinConjuncts.second, newMarkJoinConjuncts.second, + join.getDistributeHint(), 
join.getMarkJoinSlotReference(), join.children()); }).toRule(RuleType.REWRITE_JOIN_EXPRESSION); } + + private Pair> rewriteConjuncts(List conjuncts, + ExpressionRewriteContext context) { + boolean isChanged = false; + ImmutableList.Builder rewrittenConjuncts = new ImmutableList.Builder<>(); + for (Expression expr : conjuncts) { + Expression newExpr = rewriter.rewrite(expr, context); + isChanged = isChanged || !newExpr.equals(expr); + rewrittenConjuncts.addAll(ExpressionUtils.extractConjunction(newExpr)); + } + return Pair.of(isChanged, rewrittenConjuncts.build()); + } } private class SortExpressionRewrite extends OneRewriteRuleFactory { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/TrySimplifyPredicateWithMarkJoinSlot.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/TrySimplifyPredicateWithMarkJoinSlot.java new file mode 100644 index 0000000000..d4dc6697d8 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/TrySimplifyPredicateWithMarkJoinSlot.java @@ -0,0 +1,113 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.nereids.rules.expression.rules; + +import org.apache.doris.nereids.rules.expression.AbstractExpressionRewriteRule; +import org.apache.doris.nereids.rules.expression.ExpressionRewriteContext; +import org.apache.doris.nereids.trees.expressions.And; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.MarkJoinSlotReference; +import org.apache.doris.nereids.trees.expressions.Or; +import org.apache.doris.nereids.trees.expressions.literal.BooleanLiteral; + +/** + * TrySimplifyPredicateWithMarkJoinSlot + */ +public class TrySimplifyPredicateWithMarkJoinSlot extends AbstractExpressionRewriteRule { + public static final TrySimplifyPredicateWithMarkJoinSlot INSTANCE = + new TrySimplifyPredicateWithMarkJoinSlot(); + + @Override + public Expression visitAnd(And and, ExpressionRewriteContext context) { + /* + * predicate(with mark slot) and predicate(no mark slot) + * false and TRUE -> false(*) -> discard + * false and NULL -> null -> discard + * false and FALSE -> false -> discard + * + * null and TRUE -> null(*) -> discard + * null and NULL -> null -> discard + * null and FALSE -> false -> discard + * + * true and TRUE -> true(x) -> keep + * true and NULL -> null -> discard + * true and FALSE -> false -> discard + * + * we can see only 'predicate(with mark slot) and TRUE' may produce different results(*) + * because in filter predicate, we discard null and false values and only keep true values + * we can substitute mark slot with null and false to evaluate the predicate + * if the result are true, or result is either false or null, we can use non-nullable mark slot + * see ExpressionUtils.canInferNotNullForMarkSlot for more info + * we change 'predicate(with mark slot) and predicate(no mark slot)' -> predicate(with mark slot) and true + * to evaluate the predicate + */ + Expression left = and.left(); + Expression newLeft = left.accept(this, context); + + if 
(newLeft.getInputSlots().stream().noneMatch(MarkJoinSlotReference.class::isInstance)) { + newLeft = BooleanLiteral.TRUE; + } + + Expression right = and.right(); + Expression newRight = right.accept(this, context); + if (newRight.getInputSlots().stream().noneMatch(MarkJoinSlotReference.class::isInstance)) { + newRight = BooleanLiteral.TRUE; + } + Expression expr = new And(newLeft, newRight); + return expr; + } + + @Override + public Expression visitOr(Or or, ExpressionRewriteContext context) { + /* + * predicate(with mark slot) or predicate(no mark slot) + * false or TRUE -> true -> keep + * false or NULL -> null(^) -> discard + * false or FALSE -> false(*) -> discard + * + * null or TRUE -> true -> keep + * null or NULL -> null(^) -> discard + * null or FALSE -> null(*) -> discard + * + * true or TRUE -> true -> keep + * true or NULL -> true(#) -> keep + * true or FALSE -> true(x) -> keep + * + * like And operator, even there are more differences. we can get the same conclusion. + * by substituting mark slot with null and false to evaluate the predicate + * if the result are true, or result is either false or null, we can use non-nullable mark slot + * we change 'predicate(with mark slot) or predicate(no mark slot)' -> predicate(with mark slot) or false + * to evaluate the predicate + */ + Expression left = or.left(); + Expression newLeft = left.accept(this, context); + + if (newLeft.getInputSlots().stream().noneMatch(MarkJoinSlotReference.class::isInstance)) { + newLeft = BooleanLiteral.FALSE; + } + + Expression right = or.right(); + Expression newRight = right.accept(this, context); + if (newRight.getInputSlots().stream().noneMatch(MarkJoinSlotReference.class::isInstance)) { + newRight = BooleanLiteral.FALSE; + } + Expression expr = new Or(newLeft, newRight); + return expr; + } + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToHashJoin.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToHashJoin.java index 1ab830d57b..a8de496af4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToHashJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToHashJoin.java @@ -34,6 +34,7 @@ public class LogicalJoinToHashJoin extends OneImplementationRuleFactory { join.getJoinType(), join.getHashJoinConjuncts(), join.getOtherJoinConjuncts(), + join.getMarkJoinConjuncts(), join.getDistributeHint(), join.getMarkJoinSlotReference(), join.getLogicalProperties(), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToNestedLoopJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToNestedLoopJoin.java index af02f6a2ed..278d90a763 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToNestedLoopJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalJoinToNestedLoopJoin.java @@ -34,6 +34,7 @@ public class LogicalJoinToNestedLoopJoin extends OneImplementationRuleFactory { join.getJoinType(), join.getHashJoinConjuncts(), join.getOtherJoinConjuncts(), + join.getMarkJoinConjuncts(), join.getMarkJoinSlotReference(), join.getLogicalProperties(), join.left(), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustConjunctsReturnType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustConjunctsReturnType.java index 9c1a60fdf5..a3c4263b2b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustConjunctsReturnType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustConjunctsReturnType.java @@ -64,6 +64,9 @@ public class AdjustConjunctsReturnType extends DefaultPlanRewriter impleme List otherConjuncts = join.getOtherJoinConjuncts().stream() 
.map(expr -> TypeCoercionUtils.castIfNotSameType(expr, BooleanType.INSTANCE)) .collect(Collectors.toList()); - return join.withJoinConjuncts(hashConjuncts, otherConjuncts); + List markConjuncts = join.getMarkJoinConjuncts().stream() + .map(expr -> TypeCoercionUtils.castIfNotSameType(expr, BooleanType.INSTANCE)) + .collect(Collectors.toList()); + return join.withJoinConjuncts(hashConjuncts, otherConjuncts, markConjuncts); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustNullable.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustNullable.java index 8f70b86e4b..7bb4251f16 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustNullable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/AdjustNullable.java @@ -49,6 +49,7 @@ import org.apache.doris.nereids.trees.plans.visitor.CustomRewriter; import org.apache.doris.nereids.trees.plans.visitor.DefaultPlanRewriter; import org.apache.doris.nereids.util.ExpressionUtils; +import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; @@ -117,9 +118,22 @@ public class AdjustNullable extends DefaultPlanRewriter> imple public Plan visitLogicalJoin(LogicalJoin join, Map replaceMap) { join = (LogicalJoin) super.visit(join, replaceMap); List hashConjuncts = updateExpressions(join.getHashJoinConjuncts(), replaceMap); + List markConjuncts; + if (hashConjuncts.isEmpty()) { + // if hashConjuncts is empty, mark join conjuncts may used to build hash table + // so need call updateExpressions for mark join conjuncts before adjust nullable by output slot + markConjuncts = updateExpressions(join.getMarkJoinConjuncts(), replaceMap); + } else { + markConjuncts = null; + } join.getOutputSet().forEach(o -> replaceMap.put(o.getExprId(), o)); + if (markConjuncts == null) { + // hashConjuncts is not empty, mark join 
conjuncts are processed like other join conjuncts + Preconditions.checkState(!hashConjuncts.isEmpty(), "hash conjuncts should not be empty"); + markConjuncts = updateExpressions(join.getMarkJoinConjuncts(), replaceMap); + } List otherConjuncts = updateExpressions(join.getOtherJoinConjuncts(), replaceMap); - return join.withJoinConjuncts(hashConjuncts, otherConjuncts).recomputeLogicalProperties(); + return join.withJoinConjuncts(hashConjuncts, otherConjuncts, markConjuncts).recomputeLogicalProperties(); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckDataTypes.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckDataTypes.java index d257868491..875bf42bab 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckDataTypes.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/CheckDataTypes.java @@ -33,6 +33,7 @@ import org.apache.doris.nereids.types.UnsupportedType; import com.google.common.collect.ImmutableSet; +import java.util.List; import java.util.Set; /** @@ -58,7 +59,12 @@ public class CheckDataTypes implements CustomRewriter { } private void checkLogicalJoin(LogicalJoin plan) { - plan.getHashJoinConjuncts().forEach(expr -> { + List conjuncts = plan.getHashJoinConjuncts(); + if (conjuncts.isEmpty()) { + // if hash conjuncts are empty, we may use mark conjuncts to build hash table + conjuncts = plan.getMarkJoinConjuncts(); + } + conjuncts.forEach(expr -> { DataType leftType = expr.child(0).getDataType(); DataType rightType = expr.child(1).getDataType(); if (!leftType.acceptsType(rightType)) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ConvertInnerOrCrossJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ConvertInnerOrCrossJoin.java index fd41d2deea..6f67ce9fc9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ConvertInnerOrCrossJoin.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ConvertInnerOrCrossJoin.java @@ -37,11 +37,13 @@ public class ConvertInnerOrCrossJoin implements RewriteRuleFactory { public List buildRules() { return ImmutableList.of( innerLogicalJoin() - .when(join -> join.getHashJoinConjuncts().size() == 0 && join.getOtherJoinConjuncts().size() == 0) + .when(join -> join.getHashJoinConjuncts().isEmpty() && join.getOtherJoinConjuncts().isEmpty() + && join.getMarkJoinConjuncts().isEmpty()) .then(join -> join.withJoinType(JoinType.CROSS_JOIN)) .toRule(RuleType.INNER_TO_CROSS_JOIN), crossLogicalJoin() - .when(join -> join.getHashJoinConjuncts().size() != 0 || join.getOtherJoinConjuncts().size() != 0) + .when(join -> !join.getHashJoinConjuncts().isEmpty() || !join.getOtherJoinConjuncts().isEmpty() + || !join.getMarkJoinConjuncts().isEmpty()) .then(join -> join.withJoinType(JoinType.INNER_JOIN)) .toRule(RuleType.CROSS_TO_INNER_JOIN) ); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateDedupJoinCondition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateDedupJoinCondition.java index eeabd177ff..f8baaefb39 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateDedupJoinCondition.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateDedupJoinCondition.java @@ -37,11 +37,14 @@ public class EliminateDedupJoinCondition extends OneRewriteRuleFactory { .distinct().collect(Collectors.toList()); List dedupOtherJoinConjuncts = join.getOtherJoinConjuncts().stream() .distinct().collect(Collectors.toList()); + List dedupMarkJoinConjuncts = join.getMarkJoinConjuncts().stream() + .distinct().collect(Collectors.toList()); if (dedupHashJoinConjuncts.size() == join.getHashJoinConjuncts().size() - && dedupOtherJoinConjuncts.size() == join.getOtherJoinConjuncts().size()) { + && dedupOtherJoinConjuncts.size() == join.getOtherJoinConjuncts().size() + && 
dedupMarkJoinConjuncts.size() == join.getMarkJoinConjuncts().size()) { return null; } - return join.withJoinConjuncts(dedupHashJoinConjuncts, dedupOtherJoinConjuncts); + return join.withJoinConjuncts(dedupHashJoinConjuncts, dedupOtherJoinConjuncts, dedupMarkJoinConjuncts); }).toRule(RuleType.ELIMINATE_DEDUP_JOIN_CONDITION); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateJoinCondition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateJoinCondition.java index 37e4cb85ce..c997ad328a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateJoinCondition.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateJoinCondition.java @@ -39,11 +39,15 @@ public class EliminateJoinCondition extends OneRewriteRuleFactory { List otherJoinConjuncts = join.getOtherJoinConjuncts().stream() .filter(expression -> !expression.equals(BooleanLiteral.TRUE)) .collect(Collectors.toList()); + List markJoinConjuncts = join.getMarkJoinConjuncts().stream() + .filter(expression -> !expression.equals(BooleanLiteral.TRUE)) + .collect(Collectors.toList()); if (hashJoinConjuncts.size() == join.getHashJoinConjuncts().size() - && otherJoinConjuncts.size() == join.getOtherJoinConjuncts().size()) { + && otherJoinConjuncts.size() == join.getOtherJoinConjuncts().size() + && markJoinConjuncts.size() == join.getMarkJoinConjuncts().size()) { return null; } - return join.withJoinConjuncts(hashJoinConjuncts, otherJoinConjuncts); + return join.withJoinConjuncts(hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts); }).toRule(RuleType.ELIMINATE_JOIN_CONDITION); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateMarkJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateMarkJoin.java new file mode 100644 index 0000000000..a5d59291cc --- /dev/null +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateMarkJoin.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.rewrite; + +import org.apache.doris.nereids.rules.Rule; +import org.apache.doris.nereids.rules.RuleType; +import org.apache.doris.nereids.rules.expression.rules.TrySimplifyPredicateWithMarkJoinSlot; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.plans.Plan; +import org.apache.doris.nereids.trees.plans.logical.LogicalJoin; +import org.apache.doris.nereids.util.ExpressionUtils; + +import com.google.common.collect.ImmutableList; + +import java.util.Set; + +/** + * Eliminate mark join. 
+ */ +public class EliminateMarkJoin extends OneRewriteRuleFactory { + + @Override + public Rule build() { + return logicalFilter(logicalJoin().when( + join -> join.getJoinType().isSemiJoin() && !join.getMarkJoinConjuncts().isEmpty())) + .when(filter -> canSimplifyMarkJoin(filter.getConjuncts())) + .then(filter -> filter.withChildren(eliminateMarkJoin(filter.child()))) + .toRule(RuleType.ELIMINATE_MARK_JOIN); + } + + private boolean canSimplifyMarkJoin(Set predicates) { + return ExpressionUtils + .canInferNotNullForMarkSlot(TrySimplifyPredicateWithMarkJoinSlot.INSTANCE + .rewrite(ExpressionUtils.and(predicates), null)); + } + + private LogicalJoin eliminateMarkJoin(LogicalJoin join) { + ImmutableList.Builder newHashConjuncts = ImmutableList.builder(); + newHashConjuncts.addAll(join.getHashJoinConjuncts()); + newHashConjuncts.addAll(join.getMarkJoinConjuncts()); + return join.withJoinConjuncts(newHashConjuncts.build(), join.getOtherJoinConjuncts(), + ExpressionUtils.EMPTY_CONDITION); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateNullAwareLeftAntiJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateNullAwareLeftAntiJoin.java index fd62dc6be9..0621d23575 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateNullAwareLeftAntiJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateNullAwareLeftAntiJoin.java @@ -31,7 +31,10 @@ public class EliminateNullAwareLeftAntiJoin extends OneRewriteRuleFactory { @Override public Rule build() { return nullAwareLeftAntiLogicalJoin().then(antiJoin -> { - if (Stream.concat(antiJoin.getHashJoinConjuncts().stream(), antiJoin.getOtherJoinConjuncts().stream()) + if (Stream.concat(Stream.concat( + antiJoin.getHashJoinConjuncts().stream(), + antiJoin.getOtherJoinConjuncts().stream()), + antiJoin.getMarkJoinConjuncts().stream()) .noneMatch(expression -> expression.nullable())) { return 
antiJoin.withJoinType(JoinType.LEFT_ANTI_JOIN); } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateSemiJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateSemiJoin.java index 3c9f09e931..f514a1d14e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateSemiJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/EliminateSemiJoin.java @@ -24,6 +24,7 @@ import org.apache.doris.nereids.trees.expressions.StatementScopeIdGenerator; import org.apache.doris.nereids.trees.expressions.literal.BooleanLiteral; import org.apache.doris.nereids.trees.plans.JoinType; import org.apache.doris.nereids.trees.plans.logical.LogicalEmptyRelation; +import org.apache.doris.nereids.trees.plans.logical.LogicalJoin; import java.util.List; @@ -35,6 +36,7 @@ public class EliminateSemiJoin extends OneRewriteRuleFactory { public Rule build() { return logicalJoin() // right will be converted to left + .whenNot(LogicalJoin::isMarkJoin) .when(join -> join.getJoinType().isLeftSemiOrAntiJoin()) .when(join -> join.getHashJoinConjuncts().isEmpty()) .then(join -> { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ExtractFilterFromCrossJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ExtractFilterFromCrossJoin.java index 5d360d1a0e..f5b2e41502 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ExtractFilterFromCrossJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/ExtractFilterFromCrossJoin.java @@ -40,7 +40,9 @@ public class ExtractFilterFromCrossJoin extends OneRewriteRuleFactory { return crossLogicalJoin() .then(join -> { LogicalJoin newJoin = new LogicalJoin<>(JoinType.CROSS_JOIN, - ExpressionUtils.EMPTY_CONDITION, ExpressionUtils.EMPTY_CONDITION, join.getDistributeHint(), + ExpressionUtils.EMPTY_CONDITION, ExpressionUtils.EMPTY_CONDITION, + 
join.getMarkJoinConjuncts(), + join.getDistributeHint(), join.getMarkJoinSlotReference(), join.children()); Set predicates = Stream.concat(join.getHashJoinConjuncts().stream(), join.getOtherJoinConjuncts().stream()) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/FindHashConditionForJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/FindHashConditionForJoin.java index aeeeefb593..a3d2cacfb6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/FindHashConditionForJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/FindHashConditionForJoin.java @@ -73,6 +73,7 @@ public class FindHashConditionForJoin extends OneRewriteRuleFactory { return new LogicalJoin<>(joinType, combinedHashJoinConjuncts, remainedNonHashJoinConjuncts, + join.getMarkJoinConjuncts(), join.getDistributeHint(), join.getMarkJoinSlotReference(), join.children()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InApplyToJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InApplyToJoin.java index 054b92c9f7..e3702f2a93 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InApplyToJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/InApplyToJoin.java @@ -100,32 +100,54 @@ public class InApplyToJoin extends OneRewriteRuleFactory { // TODO: trick here, because when deep copy logical plan the apply right child // is not same with query plan in subquery expr, since the scan node copy twice Expression right = inSubquery.getSubqueryOutput((LogicalPlan) apply.right()); - if (apply.isCorrelated()) { - if (inSubquery.isNot()) { - predicate = ExpressionUtils.and(ExpressionUtils.or(new EqualTo(left, right), - new IsNull(left), new IsNull(right)), - apply.getCorrelationFilter().get()); - } else { - predicate = ExpressionUtils.and(new EqualTo(left, right), - apply.getCorrelationFilter().get()); - } - } else { + 
if (apply.isMarkJoin()) { + List joinConjuncts = apply.getCorrelationFilter().isPresent() + ? ExpressionUtils.extractConjunction(apply.getCorrelationFilter().get()) + : Lists.newArrayList(); predicate = new EqualTo(left, right); - } - - List conjuncts = ExpressionUtils.extractConjunction(predicate); - if (inSubquery.isNot()) { + List markConjuncts = Lists.newArrayList(predicate); + if (!predicate.nullable() || (apply.isMarkJoinSlotNotNull() && !inSubquery.isNot())) { + // we can merge mark conjuncts with hash conjuncts in 2 scenarios + // 1. the mark join predicate is not nullable, so no null value would be produced + // 2. semi join with non-nullable mark slot. + // because semi join only care about mark slot with true value and discard false and null + // it's safe the use false instead of null in this case + joinConjuncts.addAll(markConjuncts); + markConjuncts.clear(); + } return new LogicalJoin<>( - predicate.nullable() && !apply.isCorrelated() - ? JoinType.NULL_AWARE_LEFT_ANTI_JOIN - : JoinType.LEFT_ANTI_JOIN, - Lists.newArrayList(), conjuncts, new DistributeHint(DistributeType.NONE), - apply.getMarkJoinSlotReference(), apply.children()); - } else { - return new LogicalJoin<>(JoinType.LEFT_SEMI_JOIN, Lists.newArrayList(), - conjuncts, + inSubquery.isNot() ? 
JoinType.LEFT_ANTI_JOIN : JoinType.LEFT_SEMI_JOIN, + Lists.newArrayList(), joinConjuncts, markConjuncts, new DistributeHint(DistributeType.NONE), apply.getMarkJoinSlotReference(), apply.children()); + } else { + if (apply.isCorrelated()) { + if (inSubquery.isNot()) { + predicate = ExpressionUtils.and(ExpressionUtils.or(new EqualTo(left, right), + new IsNull(left), new IsNull(right)), + apply.getCorrelationFilter().get()); + } else { + predicate = ExpressionUtils.and(new EqualTo(left, right), + apply.getCorrelationFilter().get()); + } + } else { + predicate = new EqualTo(left, right); + } + + List conjuncts = ExpressionUtils.extractConjunction(predicate); + if (inSubquery.isNot()) { + return new LogicalJoin<>( + predicate.nullable() && !apply.isCorrelated() + ? JoinType.NULL_AWARE_LEFT_ANTI_JOIN + : JoinType.LEFT_ANTI_JOIN, + Lists.newArrayList(), conjuncts, new DistributeHint(DistributeType.NONE), + apply.getMarkJoinSlotReference(), apply.children()); + } else { + return new LogicalJoin<>(JoinType.LEFT_SEMI_JOIN, Lists.newArrayList(), + conjuncts, + new DistributeHint(DistributeType.NONE), apply.getMarkJoinSlotReference(), + apply.children()); + } } }).toRule(RuleType.IN_APPLY_TO_JOIN); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/OrExpansion.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/OrExpansion.java index 1e4d7793ba..3968f817e8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/OrExpansion.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/OrExpansion.java @@ -74,6 +74,7 @@ public class OrExpansion extends OneExplorationRuleFactory { @Override public Rule build() { return logicalJoin(any(), any()).when(JoinUtils::shouldNestedLoopJoin) + .whenNot(LogicalJoin::isMarkJoin) .when(join -> supportJoinType.contains(join.getJoinType()) && ConnectContext.get().getSessionVariable().getEnablePipelineEngine()) .thenApply(ctx -> { diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownAliasThroughJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownAliasThroughJoin.java index a544dfa361..0618ffc5f5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownAliasThroughJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownAliasThroughJoin.java @@ -96,8 +96,9 @@ public class PushDownAliasThroughJoin extends OneRewriteRuleFactory { List newHash = replaceJoinConjuncts(join.getHashJoinConjuncts(), replaceMap); List newOther = replaceJoinConjuncts(join.getOtherJoinConjuncts(), replaceMap); + List newMark = replaceJoinConjuncts(join.getMarkJoinConjuncts(), replaceMap); - Plan newJoin = join.withConjunctsChildren(newHash, newOther, left, right); + Plan newJoin = join.withConjunctsChildren(newHash, newOther, newMark, left, right); return project.withProjectsAndChild(newProjects, newJoin); }).toRule(RuleType.PUSH_DOWN_ALIAS_THROUGH_JOIN); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownExpressionsInHashCondition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownExpressionsInHashCondition.java index 4aeb47a9d4..594498cf6b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownExpressionsInHashCondition.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownExpressionsInHashCondition.java @@ -99,6 +99,19 @@ public class PushDownExpressionsInHashCondition extends OneRewriteRuleFactory { } }); + // add mark conjuncts used slots to project exprs + join.getMarkJoinConjuncts().stream().flatMap(conjunct -> + conjunct.getInputSlots().stream() + ).forEach(slot -> { + if (leftExprIdSet.contains(slot.getExprId())) { + // belong to left child + leftProjectExprs.add(slot); + } else { + // belong to right child + rightProjectExprs.add(slot); + } + }); + List newHashConjuncts = 
join.getHashJoinConjuncts().stream() .map(equalTo -> equalTo.withChildren(equalTo.children() .stream().map(expr -> exprReplaceMap.get(expr).toSlot()) diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownFilterThroughJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownFilterThroughJoin.java index c90723678a..c31c05c8a7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownFilterThroughJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownFilterThroughJoin.java @@ -137,6 +137,7 @@ public class PushDownFilterThroughJoin extends OneRewriteRuleFactory { new LogicalJoin<>(join.getJoinType(), join.getHashJoinConjuncts(), joinConditions, + join.getMarkJoinConjuncts(), join.getDistributeHint(), join.getMarkJoinSlotReference(), PlanUtils.filterOrSelf(leftPredicates, join.left()), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownJoinOtherCondition.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownJoinOtherCondition.java index 3716af8acb..8fff61988d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownJoinOtherCondition.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/PushDownJoinOtherCondition.java @@ -89,7 +89,8 @@ public class PushDownJoinOtherCondition extends OneRewriteRuleFactory { Plan right = PlanUtils.filterOrSelf(rightConjuncts, join.right()); return new LogicalJoin<>(join.getJoinType(), join.getHashJoinConjuncts(), - remainingOther, join.getDistributeHint(), join.getMarkJoinSlotReference(), left, right); + remainingOther, join.getMarkJoinConjuncts(), join.getDistributeHint(), + join.getMarkJoinSlotReference(), left, right); }).toRule(RuleType.PUSH_DOWN_JOIN_OTHER_CONDITION); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyAggregateFilter.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyAggregateFilter.java index 15f1b4a554..b0b62f2e9b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyAggregateFilter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyAggregateFilter.java @@ -89,7 +89,7 @@ public class UnCorrelatedApplyAggregateFilter extends OneRewriteRuleFactory { ExpressionUtils.optionalAnd(correlatedPredicate), apply.getMarkJoinSlotReference(), apply.isNeedAddSubOutputToProjects(), - apply.isInProject(), apply.left(), newAgg); + apply.isInProject(), apply.isMarkJoinSlotNotNull(), apply.left(), newAgg); }).toRule(RuleType.UN_CORRELATED_APPLY_AGGREGATE_FILTER); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyFilter.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyFilter.java index af2c83b2b8..30b5cfcef3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyFilter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyFilter.java @@ -69,7 +69,7 @@ public class UnCorrelatedApplyFilter extends OneRewriteRuleFactory { return new LogicalApply<>(apply.getCorrelationSlot(), apply.getSubqueryExpr(), ExpressionUtils.optionalAnd(correlatedPredicate), apply.getMarkJoinSlotReference(), apply.isNeedAddSubOutputToProjects(), - apply.isInProject(), apply.left(), child); + apply.isInProject(), apply.isMarkJoinSlotNotNull(), apply.left(), child); }).toRule(RuleType.UN_CORRELATED_APPLY_FILTER); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyProjectFilter.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyProjectFilter.java index 911bf0eef0..82950e14dc 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyProjectFilter.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/rewrite/UnCorrelatedApplyProjectFilter.java @@ -90,7 +90,7 @@ public class UnCorrelatedApplyProjectFilter extends OneRewriteRuleFactory { return new LogicalApply<>(apply.getCorrelationSlot(), apply.getSubqueryExpr(), ExpressionUtils.optionalAnd(correlatedPredicate), apply.getMarkJoinSlotReference(), apply.isNeedAddSubOutputToProjects(), - apply.isInProject(), apply.left(), newProject); + apply.isInProject(), apply.isMarkJoinSlotNotNull(), apply.left(), newProject); }).toRule(RuleType.UN_CORRELATED_APPLY_PROJECT_FILTER); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index 3173b654d4..29e30b30f3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -60,7 +60,7 @@ public class JoinEstimation { private static boolean hashJoinConditionContainsUnknownColumnStats(Statistics leftStats, Statistics rightStats, Join join) { - for (Expression expr : join.getHashJoinConjuncts()) { + for (Expression expr : join.getEqualPredicates()) { for (Slot slot : expr.getInputSlots()) { ColumnStatistic colStats = leftStats.findColumnStatistics(slot); if (colStats == null) { @@ -87,7 +87,7 @@ public class JoinEstimation { boolean leftBigger = leftStats.getRowCount() > rightStats.getRowCount(); double rightStatsRowCount = StatsMathUtil.nonZeroDivisor(rightStats.getRowCount()); double leftStatsRowCount = StatsMathUtil.nonZeroDivisor(leftStats.getRowCount()); - List trustableConditions = join.getHashJoinConjuncts().stream() + List trustableConditions = join.getEqualPredicates().stream() .map(expression -> (EqualPredicate) expression) .filter( expression -> { @@ -173,7 +173,7 @@ public class 
JoinEstimation { private static double computeSelectivityForBuildSideWhenColStatsUnknown(Statistics buildStats, Join join) { double sel = 1.0; - for (Expression cond : join.getHashJoinConjuncts()) { + for (Expression cond : join.getEqualPredicates()) { if (cond instanceof EqualTo) { EqualTo equal = (EqualTo) cond; if (equal.left() instanceof Slot && equal.right() instanceof Slot) { @@ -204,7 +204,7 @@ public class JoinEstimation { } Statistics innerJoinStats; - if (join.getHashJoinConjuncts().isEmpty()) { + if (join.getEqualPredicates().isEmpty()) { innerJoinStats = estimateNestLoopJoin(leftStats, rightStats, join); } else { innerJoinStats = estimateHashJoin(leftStats, rightStats, join); @@ -283,7 +283,7 @@ public class JoinEstimation { } } double rowCount = Double.POSITIVE_INFINITY; - for (Expression conjunct : join.getHashJoinConjuncts()) { + for (Expression conjunct : join.getEqualPredicates()) { double eqRowCount = estimateSemiOrAntiRowCountBySlotsEqual(leftStats, rightStats, join, (EqualPredicate) conjunct); if (rowCount > eqRowCount) { @@ -359,7 +359,7 @@ public class JoinEstimation { */ private static Statistics updateJoinResultStatsByHashJoinCondition(Statistics innerStats, Join join) { Map updatedCols = new HashMap<>(); - for (Expression expr : join.getHashJoinConjuncts()) { + for (Expression expr : join.getEqualPredicates()) { EqualPredicate equalTo = (EqualPredicate) expr; ColumnStatistic leftColStats = ExpressionEstimation.estimate(equalTo.left(), innerStats); ColumnStatistic rightColStats = ExpressionEstimation.estimate(equalTo.right(), innerStats); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java index 6e803e258e..ee4e08ac64 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java @@ -115,7 +115,8 @@ public class LogicalPlanDeepCopier extends DefaultPlanRewriter markJoinSlotReference = apply.getMarkJoinSlotReference() .map(m -> (MarkJoinSlotReference) ExpressionDeepCopier.INSTANCE.deepCopy(m, context)); return new LogicalApply<>(correlationSlot, subqueryExpr, correlationFilter, - markJoinSlotReference, apply.isNeedAddSubOutputToProjects(), apply.isInProject(), left, right); + markJoinSlotReference, apply.isNeedAddSubOutputToProjects(), apply.isInProject(), + apply.isMarkJoinSlotNotNull(), left, right); } @Override @@ -336,7 +337,10 @@ public class LogicalPlanDeepCopier extends DefaultPlanRewriter hashJoinConjuncts = join.getHashJoinConjuncts().stream() .map(c -> ExpressionDeepCopier.INSTANCE.deepCopy(c, context)) .collect(ImmutableList.toImmutableList()); - return new LogicalJoin<>(join.getJoinType(), hashJoinConjuncts, otherJoinConjuncts, + List markJoinConjuncts = join.getMarkJoinConjuncts().stream() + .map(c -> ExpressionDeepCopier.INSTANCE.deepCopy(c, context)) + .collect(ImmutableList.toImmutableList()); + return new LogicalJoin<>(join.getJoinType(), hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, join.getDistributeHint(), join.getMarkJoinSlotReference(), children); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/algebra/Join.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/algebra/Join.java index b892214b97..39788ced17 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/algebra/Join.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/algebra/Join.java @@ -18,6 +18,7 @@ package org.apache.doris.nereids.trees.plans.algebra; import org.apache.doris.nereids.hint.DistributeHint; +import org.apache.doris.nereids.trees.expressions.EqualPredicate; import org.apache.doris.nereids.trees.expressions.EqualTo; import 
org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.MarkJoinSlotReference; @@ -28,6 +29,7 @@ import org.apache.doris.nereids.trees.plans.JoinType; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import java.util.stream.Stream; /** * Common interface for logical/physical join. @@ -42,25 +44,28 @@ public interface Join { .collect(Collectors.toList()); } + default List getEqualPredicates() { + return Stream.concat(getHashJoinConjuncts().stream(), getMarkJoinConjuncts().stream()) + .filter(EqualPredicate.class::isInstance).map(EqualPredicate.class::cast) + .collect(Collectors.toList()); + } + List getOtherJoinConjuncts(); + List getMarkJoinConjuncts(); + Optional getOnClauseCondition(); DistributeHint getDistributeHint(); boolean isMarkJoin(); + Optional getMarkJoinSlotReference(); + default boolean hasDistributeHint() { return getDistributeHint().distributeType != DistributeType.NONE; } - /** - * The join plan has join condition or not. 
- */ - default boolean hasJoinCondition() { - return !getHashJoinConjuncts().isEmpty() || !getOtherJoinConjuncts().isEmpty(); - } - default JoinDistributeType getLeftHint() { return JoinDistributeType.NONE; } @@ -78,8 +83,4 @@ public interface Join { return JoinDistributeType.NONE; } } - - default Optional getLeftMarkJoinSlotReference() { - return Optional.empty(); - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalApply.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalApply.java index 04656f0bd1..517048c209 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalApply.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalApply.java @@ -62,6 +62,14 @@ public class LogicalApply groupExpression, Optional logicalProperties, List correlationSlot, @@ -69,6 +77,7 @@ public class LogicalApply markJoinSlotReference, boolean needAddSubOutputToProjects, boolean inProject, + boolean isMarkJoinSlotNotNull, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { super(PlanType.LOGICAL_APPLY, groupExpression, logicalProperties, leftChild, rightChild); this.correlationSlot = correlationSlot == null ? 
ImmutableList.of() : ImmutableList.copyOf(correlationSlot); @@ -77,14 +86,15 @@ public class LogicalApply correlationSlot, SubqueryExpr subqueryExpr, Optional correlationFilter, Optional markJoinSlotReference, - boolean needAddSubOutputToProjects, boolean inProject, + boolean needAddSubOutputToProjects, boolean inProject, boolean isMarkJoinSlotNotNull, LEFT_CHILD_TYPE input, RIGHT_CHILD_TYPE subquery) { this(Optional.empty(), Optional.empty(), correlationSlot, subqueryExpr, correlationFilter, - markJoinSlotReference, needAddSubOutputToProjects, inProject, input, + markJoinSlotReference, needAddSubOutputToProjects, inProject, isMarkJoinSlotNotNull, input, subquery); } @@ -136,6 +146,10 @@ public class LogicalApply computeOutput() { return ImmutableList.builder() @@ -153,6 +167,7 @@ public class LogicalApply withSubqueryExprAndChildren(SubqueryExpr subqueryExpr, List children) { return new LogicalApply<>(correlationSlot, subqueryExpr, correlationFilter, - markJoinSlotReference, needAddSubOutputToProjects, inProject, children.get(0), children.get(1)); + markJoinSlotReference, needAddSubOutputToProjects, inProject, isMarkJoinSlotNotNull, + children.get(0), children.get(1)); } @Override public LogicalApply withChildren(List children) { Preconditions.checkArgument(children.size() == 2); return new LogicalApply<>(correlationSlot, subqueryExpr, correlationFilter, - markJoinSlotReference, needAddSubOutputToProjects, inProject, + markJoinSlotReference, needAddSubOutputToProjects, inProject, isMarkJoinSlotNotNull, children.get(0), children.get(1)); } @@ -216,7 +233,7 @@ public class LogicalApply groupExpression) { return new LogicalApply<>(groupExpression, Optional.of(getLogicalProperties()), correlationSlot, subqueryExpr, correlationFilter, markJoinSlotReference, - needAddSubOutputToProjects, inProject, left(), right()); + needAddSubOutputToProjects, inProject, isMarkJoinSlotNotNull, left(), right()); } @Override @@ -225,6 +242,6 @@ public class LogicalApply(groupExpression, 
logicalProperties, correlationSlot, subqueryExpr, correlationFilter, markJoinSlotReference, - needAddSubOutputToProjects, inProject, children.get(0), children.get(1)); + needAddSubOutputToProjects, inProject, isMarkJoinSlotNotNull, children.get(0), children.get(1)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalJoin.java index ae8db7c97d..d6d183408a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalJoin.java @@ -67,6 +67,7 @@ public class LogicalJoin otherJoinConjuncts; private final List hashJoinConjuncts; + private final List markJoinConjuncts; // When the predicate condition contains subqueries and disjunctions, the join will be marked as MarkJoin. private final Optional markJoinSlotReference; @@ -80,114 +81,92 @@ public class LogicalJoin hashJoinConjuncts, LEFT_CHILD_TYPE leftChild, - RIGHT_CHILD_TYPE rightChild) { + public LogicalJoin(JoinType joinType, List hashJoinConjuncts, + LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { this(joinType, hashJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, - new DistributeHint(DistributeType.NONE), Optional.empty(), - Optional.empty(), Optional.empty(), leftChild, rightChild); + ExpressionUtils.EMPTY_CONDITION, new DistributeHint(DistributeType.NONE), + Optional.empty(), Optional.empty(), Optional.empty(), + ImmutableList.of(leftChild, rightChild), null); } public LogicalJoin(JoinType joinType, List hashJoinConjuncts, List otherJoinConjuncts, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - this(joinType, hashJoinConjuncts, otherJoinConjuncts, + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, new DistributeHint(DistributeType.NONE), Optional.empty(), - Optional.empty(), Optional.empty(), leftChild, 
rightChild); + Optional.empty(), Optional.empty(), ImmutableList.of(leftChild, rightChild), null); } - public LogicalJoin(JoinType joinType, List hashJoinConjuncts, List otherJoinConjuncts, - DistributeHint hint, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - this(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, Optional.empty(), Optional.empty(), - Optional.empty(), leftChild, rightChild); + public LogicalJoin(JoinType joinType, List hashJoinConjuncts, + List otherJoinConjuncts, DistributeHint hint, LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, hint, + Optional.empty(), Optional.empty(), Optional.empty(), + ImmutableList.of(leftChild, rightChild), null); } - public LogicalJoin( - JoinType joinType, - List hashJoinConjuncts, - List otherJoinConjuncts, - DistributeHint hint, - Optional markJoinSlotReference, - LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - this(joinType, hashJoinConjuncts, - otherJoinConjuncts, hint, markJoinSlotReference, - Optional.empty(), Optional.empty(), leftChild, rightChild); + public LogicalJoin(JoinType joinType, List hashJoinConjuncts, + List otherJoinConjuncts, DistributeHint hint, + Optional markJoinSlotReference, LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, hint, + markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(leftChild, rightChild), null); } - public LogicalJoin( - long bitmap, - JoinType joinType, - List hashJoinConjuncts, - List otherJoinConjuncts, - DistributeHint hint, - Optional markJoinSlotReference, - LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - this(joinType, hashJoinConjuncts, - otherJoinConjuncts, hint, markJoinSlotReference, - Optional.empty(), Optional.empty(), leftChild, rightChild); + public LogicalJoin(JoinType joinType, List 
hashJoinConjuncts, + List otherJoinConjuncts, List markJoinConjuncts, DistributeHint hint, + Optional markJoinSlotReference, LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, hint, + markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(leftChild, rightChild), null); + } + + public LogicalJoin(long bitmap, JoinType joinType, List hashJoinConjuncts, + List otherJoinConjuncts, DistributeHint hint, + Optional markJoinSlotReference, LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, hint, + markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(leftChild, rightChild), null); this.bitmap = LongBitmap.or(this.bitmap, bitmap); } - public LogicalJoin( - JoinType joinType, - List hashJoinConjuncts, - List otherJoinConjuncts, - DistributeHint hint, - Optional markJoinSlotReference, - List children) { - this(joinType, hashJoinConjuncts, - otherJoinConjuncts, hint, markJoinSlotReference, - Optional.empty(), Optional.empty(), children); + public LogicalJoin(JoinType joinType, List hashJoinConjuncts, + List otherJoinConjuncts, DistributeHint hint, + Optional markJoinSlotReference, List children) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, hint, + markJoinSlotReference, Optional.empty(), Optional.empty(), children, null); } - private LogicalJoin(JoinType joinType, List hashJoinConjuncts, List otherJoinConjuncts, + public LogicalJoin(JoinType joinType, List hashJoinConjuncts, + List otherJoinConjuncts, List markJoinConjuncts, DistributeHint hint, + Optional markJoinSlotReference, List children) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, hint, + markJoinSlotReference, Optional.empty(), Optional.empty(), children, null); + } + + private LogicalJoin(JoinType joinType, List 
hashJoinConjuncts, + List otherJoinConjuncts, List markJoinConjuncts, DistributeHint hint, Optional markJoinSlotReference, - Optional groupExpression, Optional logicalProperties, - List children, JoinReorderContext joinReorderContext) { + Optional groupExpression, + Optional logicalProperties, List children, + JoinReorderContext joinReorderContext) { // Just use in withXXX method. Don't need check/copyOf() super(PlanType.LOGICAL_JOIN, groupExpression, logicalProperties, children); this.joinType = Objects.requireNonNull(joinType, "joinType can not be null"); - this.hashJoinConjuncts = hashJoinConjuncts; - this.otherJoinConjuncts = otherJoinConjuncts; - this.hint = Objects.requireNonNull(hint, "hint can not be null"); - this.joinReorderContext.copyFrom(joinReorderContext); - this.markJoinSlotReference = markJoinSlotReference; - } - - private LogicalJoin( - JoinType joinType, - List hashJoinConjuncts, - List otherJoinConjuncts, - DistributeHint hint, - Optional markJoinSlotReference, - Optional groupExpression, - Optional logicalProperties, - LEFT_CHILD_TYPE leftChild, - RIGHT_CHILD_TYPE rightChild) { - super(PlanType.LOGICAL_JOIN, groupExpression, logicalProperties, leftChild, rightChild); - this.joinType = Objects.requireNonNull(joinType, "joinType can not be null"); - this.hashJoinConjuncts = ImmutableList.copyOf(hashJoinConjuncts); - this.otherJoinConjuncts = ImmutableList.copyOf(otherJoinConjuncts); - this.hint = Objects.requireNonNull(hint, "hint can not be null"); - this.markJoinSlotReference = markJoinSlotReference; - } - - private LogicalJoin( - JoinType joinType, - List hashJoinConjuncts, - List otherJoinConjuncts, - DistributeHint hint, - Optional markJoinSlotReference, - Optional groupExpression, - Optional logicalProperties, - List children) { - super(PlanType.LOGICAL_JOIN, groupExpression, logicalProperties, children); - this.joinType = Objects.requireNonNull(joinType, "joinType can not be null"); this.hashJoinConjuncts = 
ImmutableList.copyOf(hashJoinConjuncts); this.otherJoinConjuncts = ImmutableList.copyOf(otherJoinConjuncts); + this.markJoinConjuncts = ImmutableList.copyOf(markJoinConjuncts); this.hint = Objects.requireNonNull(hint, "hint can not be null"); + if (joinReorderContext != null) { + this.joinReorderContext.copyFrom(joinReorderContext); + } this.markJoinSlotReference = markJoinSlotReference; } @@ -204,26 +183,56 @@ public class LogicalJoin getConditionSlot() { + // this function is called by rules which reject mark join + // so markJoinConjuncts is not processed here + Preconditions.checkState(!isMarkJoin(), + "shouldn't call mark join's getConditionSlot method"); return Stream.concat(hashJoinConjuncts.stream(), otherJoinConjuncts.stream()) - .flatMap(expr -> expr.getInputSlots().stream()).collect(ImmutableSet.toImmutableSet()); + .flatMap(expr -> expr.getInputSlots().stream()) + .collect(ImmutableSet.toImmutableSet()); } + /** + * getConditionExprId + */ public Set getConditionExprId() { + // this function is called by rules which reject mark join + // so markJoinConjuncts is not processed here + Preconditions.checkState(!isMarkJoin(), + "shouldn't call mark join's getConditionExprId method"); return Stream.concat(getHashJoinConjuncts().stream(), getOtherJoinConjuncts().stream()) .flatMap(expr -> expr.getInputSlotExprIds().stream()).collect(Collectors.toSet()); } + /** + * getLeftConditionSlot + */ public Set getLeftConditionSlot() { + // TODO this function is used by TransposeSemiJoinAgg, we assume it can handle mark join correctly. 
Set leftOutputSet = this.left().getOutputSet(); - return Stream.concat(hashJoinConjuncts.stream(), otherJoinConjuncts.stream()) - .flatMap(expr -> expr.getInputSlots().stream()) - .filter(leftOutputSet::contains) + return Stream + .concat(Stream.concat(hashJoinConjuncts.stream(), otherJoinConjuncts.stream()), + markJoinConjuncts.stream()) + .flatMap(expr -> expr.getInputSlots().stream()).filter(leftOutputSet::contains) .collect(ImmutableSet.toImmutableSet()); } + /** + * getOnClauseCondition + */ public Optional getOnClauseCondition() { - return ExpressionUtils.optionalAnd(hashJoinConjuncts, otherJoinConjuncts); + // TODO this function is called by AggScalarSubQueryToWindowFunction and InferPredicates + // we assume they can handle mark join correctly + Optional normalJoinConjuncts = + ExpressionUtils.optionalAnd(hashJoinConjuncts, otherJoinConjuncts); + return normalJoinConjuncts.isPresent() + ? ExpressionUtils.optionalAnd(ImmutableList.of(normalJoinConjuncts.get()), + markJoinConjuncts) + : ExpressionUtils.optionalAnd(markJoinConjuncts); } public JoinType getJoinType() { @@ -242,6 +251,10 @@ public class LogicalJoin getMarkJoinConjuncts() { + return markJoinConjuncts; + } + public JoinReorderContext getJoinReorderContext() { return joinReorderContext; } @@ -261,7 +274,8 @@ public class LogicalJoin() .addAll(hashJoinConjuncts) .addAll(otherJoinConjuncts) + .addAll(markJoinConjuncts) .build(); } @@ -328,67 +344,86 @@ public class LogicalJoin withChildren(List children) { Preconditions.checkArgument(children.size() == 2); - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - Optional.empty(), Optional.empty(), children, joinReorderContext); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), children, + joinReorderContext); } @Override public LogicalJoin withGroupExpression(Optional groupExpression) { - 
return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, Optional.of(getLogicalProperties()), children, joinReorderContext); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, groupExpression, Optional.of(getLogicalProperties()), + children, joinReorderContext); } @Override public Plan withGroupExprLogicalPropChildren(Optional groupExpression, Optional logicalProperties, List children) { Preconditions.checkArgument(children.size() == 2); - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, logicalProperties, children, joinReorderContext); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, groupExpression, logicalProperties, children, + joinReorderContext); } public LogicalJoin withChildrenNoContext(Plan left, Plan right) { - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, - markJoinSlotReference, left, right); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(left, right), null); } - public LogicalJoin withJoinConjuncts( - List hashJoinConjuncts, List otherJoinConjuncts) { - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, - hint, markJoinSlotReference, children); + public LogicalJoin withJoinConjuncts(List hashJoinConjuncts, + List otherJoinConjuncts) { + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), children, null); + } + + public LogicalJoin withJoinConjuncts(List hashJoinConjuncts, + List otherJoinConjuncts, + List markJoinConjuncts) { + return new LogicalJoin<>(joinType, 
hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), children, null); } public LogicalJoin withHashJoinConjunctsAndChildren( List hashJoinConjuncts, Plan left, Plan right) { Preconditions.checkArgument(children.size() == 2); - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, - markJoinSlotReference, left, right); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(left, right), null); } public LogicalJoin withConjunctsChildren(List hashJoinConjuncts, List otherJoinConjuncts, Plan left, Plan right) { - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, left, - right); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(left, right), null); + } + + public LogicalJoin withConjunctsChildren(List hashJoinConjuncts, + List otherJoinConjuncts, + List markJoinConjuncts, Plan left, Plan right) { + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(left, right), null); } public LogicalJoin withJoinType(JoinType joinType) { - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, - markJoinSlotReference, children); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), children, null); } public LogicalJoin withTypeChildren(JoinType joinType, Plan left, Plan right) { - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, - markJoinSlotReference, left, right); - } - - public LogicalJoin 
withOtherJoinConjuncts(List otherJoinConjuncts) { - return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, - markJoinSlotReference, children); + return new LogicalJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, + hint, markJoinSlotReference, Optional.empty(), Optional.empty(), + ImmutableList.of(left, right), null); } /** * extractNullRejectHashKeys */ public @Nullable Pair, Set> extractNullRejectHashKeys() { + // this function is only used by computeFuncDeps, and function dependence calculation is disabled for mark join + // so markJoinConjuncts is not processed now Set leftKeys = new HashSet<>(); Set rightKeys = new HashSet<>(); for (Expression expression : hashJoinConjuncts) { @@ -413,6 +448,10 @@ public class LogicalJoin> outputSupplier) { + if (isMarkJoin()) { + // TODO disable function dependence calculation for mark join, but need re-think this in future. + return FunctionalDependencies.EMPTY_FUNC_DEPS; + } //1. NALAJ and FOJ block functional dependencies if (joinType.isNullAwareLeftAntiJoin() || joinType.isFullOuterJoin()) { return FunctionalDependencies.EMPTY_FUNC_DEPS; @@ -478,6 +517,8 @@ public class LogicalJoin getEqualSlots() { + // this function is only used by EliminateJoinByFK rule, and EliminateJoinByFK is disabled for mark join + // so markJoinConjuncts is not processed now // TODO: Use fd in the future if (!joinType.isInnerJoin() && !joinType.isSemiJoin()) { return ImmutableEqualSet.empty(); @@ -499,6 +540,7 @@ public class LogicalJoin otherJoinConjuncts; private final ImmutableList hashJoinConjuncts; private final DistributeHint hint; - private final Optional markJoinSlotReference; public UsingJoin(JoinType joinType, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild, List expressions, List hashJoinConjuncts, DistributeHint hint) { this(joinType, leftChild, rightChild, expressions, - hashJoinConjuncts, Optional.empty(), Optional.empty(), hint, Optional.empty()); + hashJoinConjuncts, 
Optional.empty(), Optional.empty(), hint); } /** @@ -63,13 +61,12 @@ public class UsingJoin expressions, List hashJoinConjuncts, Optional groupExpression, Optional logicalProperties, - DistributeHint hint, Optional markJoinSlotReference) { + DistributeHint hint) { super(PlanType.LOGICAL_USING_JOIN, groupExpression, logicalProperties, leftChild, rightChild); this.joinType = joinType; this.otherJoinConjuncts = ImmutableList.copyOf(expressions); this.hashJoinConjuncts = ImmutableList.copyOf(hashJoinConjuncts); this.hint = hint; - this.markJoinSlotReference = markJoinSlotReference; } @Override @@ -114,20 +111,20 @@ public class UsingJoin groupExpression) { return new UsingJoin(joinType, child(0), child(1), otherJoinConjuncts, - hashJoinConjuncts, groupExpression, Optional.of(getLogicalProperties()), hint, markJoinSlotReference); + hashJoinConjuncts, groupExpression, Optional.of(getLogicalProperties()), hint); } @Override public Plan withGroupExprLogicalPropChildren(Optional groupExpression, Optional logicalProperties, List children) { return new UsingJoin(joinType, children.get(0), children.get(1), otherJoinConjuncts, - hashJoinConjuncts, groupExpression, logicalProperties, hint, markJoinSlotReference); + hashJoinConjuncts, groupExpression, logicalProperties, hint); } @Override public Plan withChildren(List children) { return new UsingJoin(joinType, children.get(0), children.get(1), otherJoinConjuncts, - hashJoinConjuncts, groupExpression, Optional.of(getLogicalProperties()), hint, markJoinSlotReference); + hashJoinConjuncts, groupExpression, Optional.of(getLogicalProperties()), hint); } @Override @@ -160,11 +157,15 @@ public class UsingJoin getMarkJoinSlotReference() { - return markJoinSlotReference; + return Optional.empty(); + } + + public List getMarkJoinConjuncts() { + return ExpressionUtils.EMPTY_CONDITION; } @Override @@ -176,9 +177,4 @@ public class UsingJoin hashJoinConjuncts; protected final List otherJoinConjuncts; + protected final List markJoinConjuncts; 
protected final DistributeHint hint; protected final Optional markJoinSlotReference; protected final List runtimeFilters = Lists.newArrayList(); @@ -81,12 +83,9 @@ public abstract class AbstractPhysicalJoin< Optional markJoinSlotReference, Optional groupExpression, LogicalProperties logicalProperties, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - super(type, groupExpression, logicalProperties, leftChild, rightChild); - this.joinType = Objects.requireNonNull(joinType, "joinType can not be null"); - this.hashJoinConjuncts = ImmutableList.copyOf(hashJoinConjuncts); - this.otherJoinConjuncts = ImmutableList.copyOf(otherJoinConjuncts); - this.hint = Objects.requireNonNull(hint, "hint can not be null"); - this.markJoinSlotReference = markJoinSlotReference; + this(type, joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, + hint, markJoinSlotReference, groupExpression, logicalProperties, null, null, + leftChild, rightChild); } /** @@ -105,10 +104,30 @@ public abstract class AbstractPhysicalJoin< Statistics statistics, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { + this(type, joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, + hint, markJoinSlotReference, groupExpression, logicalProperties, physicalProperties, + statistics, leftChild, rightChild); + } + + protected AbstractPhysicalJoin( + PlanType type, + JoinType joinType, + List hashJoinConjuncts, + List otherJoinConjuncts, + List markJoinConjuncts, + DistributeHint hint, + Optional markJoinSlotReference, + Optional groupExpression, + LogicalProperties logicalProperties, + PhysicalProperties physicalProperties, + Statistics statistics, + LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { super(type, groupExpression, logicalProperties, physicalProperties, statistics, leftChild, rightChild); this.joinType = Objects.requireNonNull(joinType, "joinType can not be null"); this.hashJoinConjuncts = 
ImmutableList.copyOf(hashJoinConjuncts); this.otherJoinConjuncts = ImmutableList.copyOf(otherJoinConjuncts); + this.markJoinConjuncts = ImmutableList.copyOf(markJoinConjuncts); this.hint = hint; this.markJoinSlotReference = markJoinSlotReference; } @@ -137,11 +156,16 @@ public abstract class AbstractPhysicalJoin< return markJoinSlotReference.isPresent(); } + public List getMarkJoinConjuncts() { + return markJoinConjuncts; + } + @Override public List getExpressions() { return new Builder() .addAll(hashJoinConjuncts) - .addAll(otherJoinConjuncts).build(); + .addAll(otherJoinConjuncts) + .addAll(markJoinConjuncts).build(); } // TODO: @@ -158,13 +182,14 @@ public abstract class AbstractPhysicalJoin< return joinType == that.joinType && hashJoinConjuncts.equals(that.hashJoinConjuncts) && otherJoinConjuncts.equals(that.otherJoinConjuncts) + && markJoinConjuncts.equals(that.markJoinConjuncts) && hint.equals(that.hint) && Objects.equals(markJoinSlotReference, that.markJoinSlotReference); } @Override public int hashCode() { - return Objects.hash(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinSlotReference); + return Objects.hash(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, markJoinSlotReference); } /** @@ -173,7 +198,14 @@ public abstract class AbstractPhysicalJoin< * @return the combination of hashJoinConjuncts and otherJoinConjuncts */ public Optional getOnClauseCondition() { - return ExpressionUtils.optionalAnd(hashJoinConjuncts, otherJoinConjuncts); + // TODO this function is called by AggScalarSubQueryToWindowFunction and InferPredicates + // we assume they can handle mark join correctly + Optional normalJoinConjuncts = + ExpressionUtils.optionalAnd(hashJoinConjuncts, otherJoinConjuncts); + return normalJoinConjuncts.isPresent() + ? 
ExpressionUtils.optionalAnd(ImmutableList.of(normalJoinConjuncts.get()), + markJoinConjuncts) + : ExpressionUtils.optionalAnd(markJoinConjuncts); } @Override @@ -200,6 +232,7 @@ public abstract class AbstractPhysicalJoin< properties.put("JoinType", joinType.toString()); properties.put("HashJoinConjuncts", hashJoinConjuncts.toString()); properties.put("OtherJoinConjuncts", otherJoinConjuncts.toString()); + properties.put("MarkJoinConjuncts", markJoinConjuncts.toString()); properties.put("JoinHint", hint.toString()); properties.put("MarkJoinSlotReference", markJoinSlotReference.toString()); physicalJoin.put("Properties", properties); @@ -223,7 +256,14 @@ public abstract class AbstractPhysicalJoin< .build(); } + /** + * getConditionSlot + */ public Set getConditionSlot() { + // this function is called by rules which reject mark join + // so markJoinConjuncts is not processed here + Preconditions.checkState(!isMarkJoin(), + "shouldn't call mark join's getConditionSlot method"); return Stream.concat(hashJoinConjuncts.stream(), otherJoinConjuncts.stream()) .flatMap(expr -> expr.getInputSlots().stream()).collect(ImmutableSet.toImmutableSet()); } @@ -233,7 +273,8 @@ public abstract class AbstractPhysicalJoin< List args = Lists.newArrayList("type", joinType, "stats", statistics, "hashCondition", hashJoinConjuncts, - "otherCondition", otherJoinConjuncts); + "otherCondition", otherJoinConjuncts, + "markCondition", markJoinConjuncts); if (markJoinSlotReference.isPresent()) { args.add("isMarkJoin"); args.add("true"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashJoin.java index 58d3530b69..43b82dcc20 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalHashJoin.java @@ -35,6 +35,7 @@ import 
org.apache.doris.nereids.trees.plans.JoinType; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PlanType; import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; +import org.apache.doris.nereids.util.ExpressionUtils; import org.apache.doris.nereids.util.MutableState; import org.apache.doris.planner.RuntimeFilterId; import org.apache.doris.qe.ConnectContext; @@ -86,14 +87,30 @@ public class PhysicalHashJoin< Optional groupExpression, LogicalProperties logicalProperties, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - super(PlanType.PHYSICAL_HASH_JOIN, joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, logicalProperties, leftChild, rightChild); + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, hint, + markJoinSlotReference, groupExpression, logicalProperties, null, null, leftChild, + rightChild); + } + + public PhysicalHashJoin( + JoinType joinType, + List hashJoinConjuncts, + List otherJoinConjuncts, + List markJoinConjuncts, + DistributeHint hint, + Optional markJoinSlotReference, + LogicalProperties logicalProperties, + LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, hint, markJoinSlotReference, + Optional.empty(), logicalProperties, null, null, leftChild, rightChild); } private PhysicalHashJoin( JoinType joinType, List hashJoinConjuncts, List otherJoinConjuncts, + List markJoinConjuncts, DistributeHint hint, Optional markJoinSlotReference, Optional groupExpression, @@ -102,8 +119,9 @@ public class PhysicalHashJoin< Statistics statistics, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - super(PlanType.PHYSICAL_HASH_JOIN, joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, logicalProperties, physicalProperties, statistics, leftChild, rightChild); + 
super(PlanType.PHYSICAL_HASH_JOIN, joinType, hashJoinConjuncts, otherJoinConjuncts, + markJoinConjuncts, hint, markJoinSlotReference, groupExpression, logicalProperties, + physicalProperties, statistics, leftChild, rightChild); } /** @@ -111,6 +129,12 @@ public class PhysicalHashJoin< * Return pair of left used slots and right used slots. */ public Pair, List> getHashConjunctsExprIds() { + // TODO this function is only called by addShuffleJoinRequestProperty + // currently standalone mark join can only allow broadcast( we can remove this limitation after implement + // something like nullaware shuffle to broadcast nulls to all instances + // mark join with non-empty hash join conjuncts allow shuffle join by hash join conjuncts + Preconditions.checkState(!(isMarkJoin() && hashJoinConjuncts.isEmpty()), + "shouldn't call mark join's getHashConjunctsExprIds method for standalone mark join"); int size = hashJoinConjuncts.size(); List exprIds1 = new ArrayList<>(size); @@ -143,7 +167,7 @@ public class PhysicalHashJoin< public PhysicalHashJoin withChildren(List children) { Preconditions.checkArgument(children.size() == 2); PhysicalHashJoin newJoin = new PhysicalHashJoin<>(joinType, hashJoinConjuncts, - otherJoinConjuncts, hint, markJoinSlotReference, + otherJoinConjuncts, markJoinConjuncts, hint, markJoinSlotReference, Optional.empty(), getLogicalProperties(), physicalProperties, statistics, children.get(0), children.get(1)); if (groupExpression.isPresent()) { @@ -155,28 +179,32 @@ public class PhysicalHashJoin< @Override public PhysicalHashJoin withGroupExpression( Optional groupExpression) { - return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, getLogicalProperties(), left(), right()); + return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, + markJoinConjuncts, hint, markJoinSlotReference, groupExpression, + getLogicalProperties(), null, null, left(), right()); } @Override 
public Plan withGroupExprLogicalPropChildren(Optional groupExpression, Optional logicalProperties, List children) { Preconditions.checkArgument(children.size() == 2); - return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, logicalProperties.get(), children.get(0), children.get(1)); + return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, + markJoinConjuncts, hint, markJoinSlotReference, groupExpression, + logicalProperties.get(), null, null, children.get(0), children.get(1)); } public PhysicalHashJoin withPhysicalPropertiesAndStats( PhysicalProperties physicalProperties, Statistics statistics) { - return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, getLogicalProperties(), physicalProperties, statistics, left(), right()); + return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, + markJoinConjuncts, hint, markJoinSlotReference, groupExpression, + getLogicalProperties(), physicalProperties, statistics, left(), right()); } @Override public boolean pushDownRuntimeFilter(CascadesContext context, IdGenerator generator, AbstractPhysicalJoin builderNode, Expression srcExpr, Expression probeExpr, TRuntimeFilterType type, long buildSideNdv, int exprOrder) { + // currently, mark join doesn't support RF, so markJoinConjuncts is not processed here if (RuntimeFilterGenerator.DENIED_JOIN_TYPES.contains(getJoinType()) || isMarkJoin()) { if (builderNode instanceof PhysicalHashJoin) { PhysicalHashJoin builderJoin = (PhysicalHashJoin) builderNode; @@ -253,6 +281,10 @@ public class PhysicalHashJoin< .sorted().collect(Collectors.joining(" and ", " hashCondition=(", ")"))); builder.append(otherJoinConjuncts.stream().map(cond -> cond.shapeInfo()) .sorted().collect(Collectors.joining(" and ", " otherCondition=(", ")"))); + if (!markJoinConjuncts.isEmpty()) { + 
builder.append(markJoinConjuncts.stream().map(cond -> cond.shapeInfo()).sorted() + .collect(Collectors.joining(" and ", " markCondition=(", ")"))); + } if (!runtimeFilters.isEmpty()) { builder.append(" build RFs:").append(runtimeFilters.stream() .map(rf -> rf.shapeInfo()).collect(Collectors.joining(";"))); @@ -262,7 +294,8 @@ public class PhysicalHashJoin< @Override public PhysicalHashJoin resetLogicalProperties() { - return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, hint, markJoinSlotReference, - groupExpression, null, physicalProperties, statistics, left(), right()); + return new PhysicalHashJoin<>(joinType, hashJoinConjuncts, otherJoinConjuncts, + markJoinConjuncts, hint, markJoinSlotReference, groupExpression, null, + physicalProperties, statistics, left(), right()); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalNestedLoopJoin.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalNestedLoopJoin.java index 1ad2f2b616..1f4e2bb171 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalNestedLoopJoin.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalNestedLoopJoin.java @@ -29,6 +29,7 @@ import org.apache.doris.nereids.trees.plans.JoinType; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PlanType; import org.apache.doris.nereids.trees.plans.visitor.PlanVisitor; +import org.apache.doris.nereids.util.ExpressionUtils; import org.apache.doris.nereids.util.MutableState; import org.apache.doris.statistics.Statistics; @@ -39,6 +40,7 @@ import com.google.common.collect.Sets; import java.util.List; import java.util.Optional; import java.util.Set; +import java.util.stream.Collectors; import java.util.stream.Stream; /** @@ -82,10 +84,8 @@ public class PhysicalNestedLoopJoin< Optional groupExpression, LogicalProperties logicalProperties, 
LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - super(PlanType.PHYSICAL_NESTED_LOOP_JOIN, joinType, hashJoinConjuncts, otherJoinConjuncts, - // nested loop join ignores join hints. - new DistributeHint(DistributeType.NONE), markJoinSlotReference, - groupExpression, logicalProperties, leftChild, rightChild); + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, markJoinSlotReference, + groupExpression, logicalProperties, null, null, leftChild, rightChild); } /** @@ -105,7 +105,36 @@ public class PhysicalNestedLoopJoin< Statistics statistics, LEFT_CHILD_TYPE leftChild, RIGHT_CHILD_TYPE rightChild) { - super(PlanType.PHYSICAL_NESTED_LOOP_JOIN, joinType, hashJoinConjuncts, otherJoinConjuncts, + this(joinType, hashJoinConjuncts, otherJoinConjuncts, ExpressionUtils.EMPTY_CONDITION, markJoinSlotReference, + groupExpression, logicalProperties, physicalProperties, statistics, leftChild, rightChild); + } + + public PhysicalNestedLoopJoin( + JoinType joinType, + List hashJoinConjuncts, + List otherJoinConjuncts, + List markJoinConjuncts, + Optional markJoinSlotReference, + LogicalProperties logicalProperties, + LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { + this(joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, markJoinSlotReference, + Optional.empty(), logicalProperties, null, null, leftChild, rightChild); + } + + private PhysicalNestedLoopJoin( + JoinType joinType, + List hashJoinConjuncts, + List otherJoinConjuncts, + List markJoinConjuncts, + Optional markJoinSlotReference, + Optional groupExpression, + LogicalProperties logicalProperties, + PhysicalProperties physicalProperties, + Statistics statistics, + LEFT_CHILD_TYPE leftChild, + RIGHT_CHILD_TYPE rightChild) { + super(PlanType.PHYSICAL_NESTED_LOOP_JOIN, joinType, hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, // nested loop join ignores join hints. 
new DistributeHint(DistributeType.NONE), markJoinSlotReference, groupExpression, logicalProperties, physicalProperties, statistics, leftChild, rightChild); @@ -132,7 +161,7 @@ public class PhysicalNestedLoopJoin< public PhysicalNestedLoopJoin withChildren(List children) { Preconditions.checkArgument(children.size() == 2); PhysicalNestedLoopJoin newJoin = new PhysicalNestedLoopJoin<>(joinType, - hashJoinConjuncts, otherJoinConjuncts, markJoinSlotReference, Optional.empty(), + hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, markJoinSlotReference, Optional.empty(), getLogicalProperties(), physicalProperties, statistics, children.get(0), children.get(1)); if (groupExpression.isPresent()) { newJoin.setMutableState(MutableState.KEY_GROUP, groupExpression.get().getOwnerGroup().getGroupId().asInt()); @@ -144,8 +173,8 @@ public class PhysicalNestedLoopJoin< public PhysicalNestedLoopJoin withGroupExpression( Optional groupExpression) { return new PhysicalNestedLoopJoin<>(joinType, - hashJoinConjuncts, otherJoinConjuncts, markJoinSlotReference, - groupExpression, getLogicalProperties(), left(), right()); + hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, markJoinSlotReference, + groupExpression, getLogicalProperties(), null, null, left(), right()); } @Override @@ -153,15 +182,15 @@ public class PhysicalNestedLoopJoin< Optional logicalProperties, List children) { Preconditions.checkArgument(children.size() == 2); return new PhysicalNestedLoopJoin<>(joinType, - hashJoinConjuncts, otherJoinConjuncts, markJoinSlotReference, groupExpression, - logicalProperties.get(), children.get(0), children.get(1)); + hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, markJoinSlotReference, groupExpression, + logicalProperties.get(), null, null, children.get(0), children.get(1)); } @Override public PhysicalNestedLoopJoin withPhysicalPropertiesAndStats( PhysicalProperties physicalProperties, Statistics statistics) { return new PhysicalNestedLoopJoin<>(joinType, - 
hashJoinConjuncts, otherJoinConjuncts, markJoinSlotReference, groupExpression, + hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, markJoinSlotReference, groupExpression, getLogicalProperties(), physicalProperties, statistics, left(), right()); } @@ -177,23 +206,39 @@ public class PhysicalNestedLoopJoin< return bitMapRuntimeFilterConditions.isEmpty(); } + /** + * getConditionSlot + */ public Set getConditionSlot() { + // this function is called by rules which reject mark join + // so markJoinConjuncts is not processed here + Preconditions.checkState(!isMarkJoin(), + "shouldn't call mark join's getConditionSlot method"); return Stream.concat(hashJoinConjuncts.stream(), otherJoinConjuncts.stream()) - .flatMap(expr -> expr.getInputSlots().stream()).collect(ImmutableSet.toImmutableSet()); + .flatMap(expr -> expr.getInputSlots().stream()) + .collect(ImmutableSet.toImmutableSet()); } @Override public String shapeInfo() { StringBuilder builder = new StringBuilder("NestedLoopJoin"); builder.append("[").append(joinType).append("]"); - otherJoinConjuncts.forEach(expr -> builder.append(expr.shapeInfo())); + if (!markJoinConjuncts.isEmpty()) { + builder.append(otherJoinConjuncts.stream().map(cond -> cond.shapeInfo()).sorted() + .collect(Collectors.joining(" and ", " otherCondition=(", ")"))); + builder.append(markJoinConjuncts.stream().map(cond -> cond.shapeInfo()).sorted() + .collect(Collectors.joining(" and ", " markCondition=(", ")"))); + } else { + otherJoinConjuncts.forEach(expr -> builder.append(expr.shapeInfo())); + } + return builder.toString(); } @Override public PhysicalNestedLoopJoin resetLogicalProperties() { return new PhysicalNestedLoopJoin<>(joinType, - hashJoinConjuncts, otherJoinConjuncts, markJoinSlotReference, groupExpression, + hashJoinConjuncts, otherJoinConjuncts, markJoinConjuncts, markJoinSlotReference, groupExpression, null, physicalProperties, statistics, left(), right()); } } diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/ExpressionUtils.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/ExpressionUtils.java index fd96ceecb9..e78ce76bb8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/ExpressionUtils.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/ExpressionUtils.java @@ -34,6 +34,7 @@ import org.apache.doris.nereids.trees.expressions.ExprId; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.InPredicate; import org.apache.doris.nereids.trees.expressions.IsNull; +import org.apache.doris.nereids.trees.expressions.MarkJoinSlotReference; import org.apache.doris.nereids.trees.expressions.NamedExpression; import org.apache.doris.nereids.trees.expressions.Not; import org.apache.doris.nereids.trees.expressions.Or; @@ -51,6 +52,8 @@ import org.apache.doris.nereids.trees.expressions.visitor.DefaultExpressionRewri import org.apache.doris.nereids.trees.expressions.visitor.DefaultExpressionVisitor; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.visitor.ExpressionLineageReplacer; +import org.apache.doris.nereids.types.BooleanType; +import org.apache.doris.nereids.types.coercion.NumericType; import com.google.common.base.Preconditions; import com.google.common.base.Predicate; @@ -58,6 +61,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableList.Builder; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Lists; +import com.google.common.collect.Maps; import com.google.common.collect.Sets; import java.util.Arrays; @@ -184,9 +188,9 @@ public class ExpressionUtils { */ public static Expression combine(Class type, Collection expressions) { /* - * (AB) (CD) E ((AB)(CD)) E (((AB)(CD))E) - * ▲ ▲ ▲ ▲ ▲ ▲ - * │ │ │ │ │ │ + * (AB) (CD) E ((AB)(CD)) E (((AB)(CD))E) + * ▲ ▲ ▲ ▲ ▲ ▲ + * │ │ │ │ │ │ * A B C D E ──► A B C D E ──► 
(AB) (CD) E ──► ((AB)(CD)) E ──► (((AB)(CD))E) */ Preconditions.checkArgument(type == And.class || type == Or.class); @@ -219,7 +223,8 @@ public class ExpressionUtils { } /** - * Replace the slot in expressions with the lineage identifier from specifiedbaseTable sets or target table types + * Replace the slot in expressions with the lineage identifier from + * specifiedbaseTable sets or target table types * example as following: * select a + 10 as a1, d from ( * select b - 5 as a, d from table @@ -234,11 +239,10 @@ public class ExpressionUtils { if (expressions.isEmpty()) { return ImmutableList.of(); } - ExpressionLineageReplacer.ExpressionReplaceContext replaceContext = - new ExpressionLineageReplacer.ExpressionReplaceContext( - expressions.stream().map(Expression.class::cast).collect(Collectors.toList()), - targetTypes, - tableIdentifiers); + ExpressionLineageReplacer.ExpressionReplaceContext replaceContext = new ExpressionLineageReplacer.ExpressionReplaceContext( + expressions.stream().map(Expression.class::cast).collect(Collectors.toList()), + targetTypes, + tableIdentifiers); plan.accept(ExpressionLineageReplacer.INSTANCE, replaceContext); // Replace expressions by expression map @@ -272,8 +276,10 @@ public class ExpressionUtils { } /** - * Check whether the input expression is a {@link org.apache.doris.nereids.trees.expressions.Slot} - * or at least one {@link Cast} on a {@link org.apache.doris.nereids.trees.expressions.Slot} + * Check whether the input expression is a + * {@link org.apache.doris.nereids.trees.expressions.Slot} + * or at least one {@link Cast} on a + * {@link org.apache.doris.nereids.trees.expressions.Slot} *

* for example: * - SlotReference to a column: @@ -283,7 +289,8 @@ public class ExpressionUtils { * cast(cast(int_col as long) as string) * * @param expr input expression - * @return Return Optional[ExprId] of underlying slot reference if input expression is a slot or cast on slot. + * @return Return Optional[ExprId] of underlying slot reference if input + * expression is a slot or cast on slot. * Otherwise, return empty optional result. */ public static Optional isSlotOrCastOnSlot(Expression expr) { @@ -291,8 +298,10 @@ public class ExpressionUtils { } /** - * Check whether the input expression is a {@link org.apache.doris.nereids.trees.expressions.Slot} - * or at least one {@link Cast} on a {@link org.apache.doris.nereids.trees.expressions.Slot} + * Check whether the input expression is a + * {@link org.apache.doris.nereids.trees.expressions.Slot} + * or at least one {@link Cast} on a + * {@link org.apache.doris.nereids.trees.expressions.Slot} */ public static Optional extractSlotOrCastOnSlot(Expression expr) { while (expr instanceof Cast) { @@ -307,7 +316,8 @@ public class ExpressionUtils { } /** - * Generate replaceMap Slot -> Expression from NamedExpression[Expression as name] + * Generate replaceMap Slot -> Expression from NamedExpression[Expression as + * name] */ public static Map generateReplaceMap(List namedExpressions) { return namedExpressions @@ -317,14 +327,14 @@ public class ExpressionUtils { Collectors.toMap( NamedExpression::toSlot, // Avoid cast to alias, retrieving the first child expression. - alias -> alias.child(0) - ) - ); + alias -> alias.child(0))); } /** - * Replace expression node in the expression tree by `replaceMap` in top-down manner. + * Replace expression node in the expression tree by `replaceMap` in top-down + * manner. * For example. + * *

      * input expression: a > 1
      * replaceMap: a -> b + c
@@ -365,7 +375,8 @@ public class ExpressionUtils {
     }
 
     /**
-     * Replace expression node in the expression tree by `replaceMap` in top-down manner.
+     * Replace expression node in the expression tree by `replaceMap` in top-down
+     * manner.
      */
     public static List replaceNamedExpressions(List namedExpressions,
             Map replaceMap) {
@@ -468,6 +479,75 @@ public class ExpressionUtils {
         return children.stream().allMatch(c -> c instanceof NullLiteral);
     }
 
+    /**
+     * Decide whether the mark join slots referenced by a filter predicate can be
+     * treated as non-nullable boolean.
+     *
+     * <p>Every mark join slot is replaced with a null literal or FALSE, in each possible
+     * combination, and the predicate is constant-folded. If the folded results are all
+     * TRUE, or all NULL/FALSE (a LogicalFilter discards both), the method returns true;
+     * a mix of the two returns false. Per the original author's note, a true result lets
+     * a semi join safely change the mark conjunct into a hash conjunct.
+     *
+     * @param predicate predicate assumed to come from a LogicalFilter
+     * @return false only when the folded results mix TRUE with NULL/FALSE
+     */
+    public static boolean canInferNotNullForMarkSlot(Expression predicate) {
+        // with no mark slot, or more than maxMarkSlotCount of them, nothing is evaluated and true is returned
+        ImmutableList literals = ImmutableList.of(new NullLiteral(BooleanType.INSTANCE), BooleanLiteral.FALSE);
+        List markJoinSlotReferenceList = ((Set) predicate
+                .collect(MarkJoinSlotReference.class::isInstance)).stream()
+                .collect(Collectors.toList());
+        int markSlotSize = markJoinSlotReferenceList.size();
+        int maxMarkSlotCount = 4;
+        // if the conjunct has mark slot, and maximum 4 mark slots(for performance)
+        if (markSlotSize > 0 && markSlotSize <= maxMarkSlotCount) {
+            Map replaceMap = Maps.newHashMap();
+            boolean meetTrue = false;
+            boolean meetNullOrFalse = false;
+            /*
+             * markSlotSize = 1 -> loopCount = 2 ---- 0, 1
+             * markSlotSize = 2 -> loopCount = 4 ---- 00, 01, 10, 11
+             * markSlotSize = 3 -> loopCount = 8 ---- 000, 001, 010, 011, 100, 101, 110, 111
+             * markSlotSize = 4 -> loopCount = 16 ---- 0000, 0001, ... 1111
+             */
+            int loopCount = 1 << markSlotSize;
+            for (int i = 0; i < loopCount; ++i) {
+                replaceMap.clear();
+                // bit j of i selects the literal that replaces mark slot j:
+                // 0 -> literals.get(0) = NullLiteral(BooleanType.INSTANCE)
+                // 1 -> literals.get(1) = BooleanLiteral.FALSE
+                for (int j = 0; j < markSlotSize; ++j) {
+                    replaceMap.put(markJoinSlotReferenceList.get(j), literals.get((i >> j) & 1));
+                }
+                Expression evalResult = FoldConstantRule.INSTANCE.rewrite(
+                        ExpressionUtils.replace(predicate, replaceMap),
+                        new ExpressionRewriteContext(null));
+
+                // a result that folds to neither TRUE nor NULL/FALSE leaves both flags untouched
+                if (evalResult.equals(BooleanLiteral.TRUE)) {
+                    if (meetNullOrFalse) {
+                        return false;
+                    } else {
+                        meetTrue = true;
+                    }
+                } else if (isNullOrFalse(evalResult)) {
+                    if (meetTrue) {
+                        return false;
+                    } else {
+                        meetNullOrFalse = true;
+                    }
+                }
+            }
+        }
+        return true;
+    }
+
+    // true when the expression is a null literal (isNullLiteral()) or equals BooleanLiteral.FALSE
+    private static boolean isNullOrFalse(Expression expression) {
+        return expression.isNullLiteral() || expression.equals(BooleanLiteral.FALSE);
+    }
+
     /**
      * infer notNulls slot from predicate
      */
@@ -502,7 +582,8 @@ public class ExpressionUtils {
     }
 
     /**
-     * infer notNulls slot from predicate but these slots must be in the given slots.
+     * infer notNulls slot from predicate but these slots must be in the given
+     * slots.
      */
     public static Set inferNotNull(Set predicates, Set slots,
             CascadesContext cascadesContext) {
@@ -666,18 +747,18 @@ public class ExpressionUtils {
      */
     public static boolean checkSlotConstant(Slot slot, Set predicates) {
         return predicates.stream().anyMatch(predicate -> {
-                    if (predicate instanceof EqualTo) {
-                        EqualTo equalTo = (EqualTo) predicate;
-                        return (equalTo.left() instanceof Literal && equalTo.right().equals(slot))
-                                || (equalTo.right() instanceof Literal && equalTo.left().equals(slot));
-                    }
-                    return false;
-                }
-        );
+            if (predicate instanceof EqualTo) {
+                EqualTo equalTo = (EqualTo) predicate;
+                return (equalTo.left() instanceof Literal && equalTo.right().equals(slot))
+                        || (equalTo.right() instanceof Literal && equalTo.left().equals(slot));
+            }
+            return false;
+        });
     }
 
     /**
-     * Check the expression is inferred or not, if inferred return true, nor return false
+     * Check the expression is inferred or not, if inferred return true, nor return
+     * false
      */
     public static boolean isInferred(Expression expression) {
         return expression.accept(new DefaultExpressionVisitor() {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/JoinUtils.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/JoinUtils.java
index 9beaa29c43..2712550479 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/nereids/util/JoinUtils.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/util/JoinUtils.java
@@ -58,10 +58,17 @@ import java.util.stream.Collectors;
  * Utils for join
  */
 public class JoinUtils {
+    /**
+     * true unless the join is a cross join, a NALAJ, or has mark conjuncts but no hash conjuncts
+     */
     public static boolean couldShuffle(Join join) {
         // Cross-join and Null-Aware-Left-Anti-Join only can be broadcast join.
-        // Because mark join would consider null value from both build and probe side, so must use broadcast join too.
-        return !(join.getJoinType().isCrossJoin() || join.getJoinType().isNullAwareLeftAntiJoin() || join.isMarkJoin());
+        // a standalone mark join considers null values from both build and probe side, so it must use broadcast join.
+        // a mark join with hash conjuncts can shuffle by its hash conjuncts
+        // TODO a standalone mark join could shuffle too, but that needs a null-aware shuffle which broadcasts
+        //  null values to all instances
+        return !(join.getJoinType().isCrossJoin() || join.getJoinType().isNullAwareLeftAntiJoin()
+                || (!join.getMarkJoinConjuncts().isEmpty() && join.getHashJoinConjuncts().isEmpty()));
     }
 
     public static boolean couldBroadcast(Join join) {
@@ -173,10 +180,14 @@ public class JoinUtils {
     }
 
     public static boolean shouldNestedLoopJoin(Join join) {
-        return join.getHashJoinConjuncts().isEmpty();
+        // currently, mark join conjuncts contain only one conjunct, so we always check the first element here
+        return join.getHashJoinConjuncts().isEmpty() && (join.getMarkJoinConjuncts().isEmpty()
+                || !(join.getMarkJoinConjuncts().get(0) instanceof EqualPredicate));
     }
 
     public static boolean shouldNestedLoopJoin(JoinType joinType, List hashConjuncts) {
+        // this function is only called by the hyper graph, which rejects mark joins,
+        // so mark joins are not handled here
         return hashConjuncts.isEmpty();
     }
 
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java
index b2ab651eb0..d96d79ebe7 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/HashJoinNode.java
@@ -76,6 +76,8 @@ public class HashJoinNode extends JoinNodeBase {
     // join conjuncts from the JOIN clause that aren't equi-join predicates
     private List otherJoinConjuncts;
 
+    private List markJoinConjuncts;
+
     private DistributionMode distrMode;
     private boolean isColocate = false; //the flag for colocate join
     private String colocateReason = ""; // if can not do colocate join, set reason here
@@ -173,6 +175,61 @@ public class HashJoinNode extends JoinNodeBase {
         vSrcToOutputSMap = new ExprSubstitutionMap(srcToOutputList, Collections.emptyList());
     }
 
+    public HashJoinNode(PlanNodeId id, PlanNode outer, PlanNode inner, JoinOperator joinOp,
+            List eqJoinConjuncts, List otherJoinConjuncts, List markJoinConjuncts,
+            List srcToOutputList, TupleDescriptor intermediateTuple,
+            TupleDescriptor outputTuple, boolean isMarkJoin) {
+        super(id, "HASH JOIN", StatisticalType.HASH_JOIN_NODE, joinOp, isMarkJoin);
+        Preconditions.checkArgument((eqJoinConjuncts != null && !eqJoinConjuncts.isEmpty())
+                || (markJoinConjuncts != null && !markJoinConjuncts.isEmpty()));
+        Preconditions.checkArgument(otherJoinConjuncts != null);
+        tblRefIds.addAll(outer.getTblRefIds());
+        tblRefIds.addAll(inner.getTblRefIds());
+
+        if (joinOp.equals(JoinOperator.LEFT_ANTI_JOIN) || joinOp.equals(JoinOperator.LEFT_SEMI_JOIN)
+                || joinOp.equals(JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN)) {
+            tupleIds.addAll(outer.getTupleIds());
+        } else if (joinOp.equals(JoinOperator.RIGHT_ANTI_JOIN) || joinOp.equals(JoinOperator.RIGHT_SEMI_JOIN)) {
+            tupleIds.addAll(inner.getTupleIds());
+        } else {
+            tupleIds.addAll(outer.getTupleIds());
+            tupleIds.addAll(inner.getTupleIds());
+        }
+
+        for (Expr eqJoinPredicate : eqJoinConjuncts) {
+            Preconditions.checkArgument(eqJoinPredicate instanceof BinaryPredicate);
+            BinaryPredicate eqJoin = (BinaryPredicate) eqJoinPredicate;
+            if (eqJoin.getOp().equals(BinaryPredicate.Operator.EQ_FOR_NULL)) {
+                Preconditions.checkArgument(eqJoin.getChildren().size() == 2);
+                if (!eqJoin.getChild(0).isNullable() || !eqJoin.getChild(1).isNullable()) {
+                    eqJoin.setOp(BinaryPredicate.Operator.EQ);
+                }
+            }
+            this.eqJoinConjuncts.add(eqJoin);
+        }
+        this.distrMode = DistributionMode.NONE;
+        this.otherJoinConjuncts = otherJoinConjuncts;
+        this.markJoinConjuncts = markJoinConjuncts;
+        children.add(outer);
+        children.add(inner);
+
+        // Inherit all the nullable tuples from the children.
+        // Mark tuples that form the "nullable" side of the outer join as nullable.
+        nullableTupleIds.addAll(inner.getNullableTupleIds());
+        nullableTupleIds.addAll(outer.getNullableTupleIds());
+        if (joinOp.equals(JoinOperator.FULL_OUTER_JOIN)) {
+            nullableTupleIds.addAll(outer.getTupleIds());
+            nullableTupleIds.addAll(inner.getTupleIds());
+        } else if (joinOp.equals(JoinOperator.LEFT_OUTER_JOIN)) {
+            nullableTupleIds.addAll(inner.getTupleIds());
+        } else if (joinOp.equals(JoinOperator.RIGHT_OUTER_JOIN)) {
+            nullableTupleIds.addAll(outer.getTupleIds());
+        }
+        vIntermediateTupleDescList = Lists.newArrayList(intermediateTuple);
+        vOutputTupleDesc = outputTuple;
+        vSrcToOutputSMap = new ExprSubstitutionMap(srcToOutputList, Collections.emptyList());
+    }
+
     public List getEqJoinConjuncts() {
         return eqJoinConjuncts;
     }
@@ -717,6 +774,32 @@ public class HashJoinNode extends JoinNodeBase {
             msg.hash_join_node.addToOtherJoinConjuncts(e.treeToThrift());
         }
 
+        if (markJoinConjuncts != null) {
+            if (eqJoinConjuncts.isEmpty()) {
+                Preconditions.checkState(joinOp == JoinOperator.LEFT_SEMI_JOIN
+                        || joinOp == JoinOperator.LEFT_ANTI_JOIN);
+                if (joinOp == JoinOperator.LEFT_SEMI_JOIN) {
+                    msg.hash_join_node.join_op = JoinOperator.NULL_AWARE_LEFT_SEMI_JOIN.toThrift();
+                } else if (joinOp == JoinOperator.LEFT_ANTI_JOIN) {
+                    msg.hash_join_node.join_op = JoinOperator.NULL_AWARE_LEFT_ANTI_JOIN.toThrift();
+                }
+                // eqJoinConjuncts must not be empty in thrift, so when there are none
+                // we send markJoinConjuncts as the eq join conjuncts instead
+                for (Expr e : markJoinConjuncts) {
+                    Preconditions.checkState(e instanceof BinaryPredicate,
+                            "mark join conjunct must be BinaryPredicate");
+                    TEqJoinCondition eqJoinCondition = new TEqJoinCondition(
+                            e.getChild(0).treeToThrift(), e.getChild(1).treeToThrift());
+                    eqJoinCondition.setOpcode(((BinaryPredicate) e).getOp().getOpcode());
+                    msg.hash_join_node.addToEqJoinConjuncts(eqJoinCondition);
+                }
+            } else {
+                for (Expr e : markJoinConjuncts) {
+                    msg.hash_join_node.addToMarkJoinConjuncts(e.treeToThrift());
+                }
+            }
+        }
+
         if (hashOutputSlotIds != null) {
             for (SlotId slotId : hashOutputSlotIds) {
                 msg.hash_join_node.addToHashOutputSlotIds(slotId.asInt());
@@ -772,6 +855,10 @@ public class HashJoinNode extends JoinNodeBase {
             output.append(detailPrefix).append("other join predicates: ")
                     .append(getExplainString(otherJoinConjuncts)).append("\n");
         }
+        if (markJoinConjuncts != null && !markJoinConjuncts.isEmpty()) {
+            output.append(detailPrefix).append("mark join predicates: ")
+                    .append(getExplainString(markJoinConjuncts)).append("\n");
+        }
         if (!conjuncts.isEmpty()) {
             output.append(detailPrefix).append("other predicates: ").append(getExplainString(conjuncts)).append("\n");
         }
@@ -849,10 +936,18 @@ public class HashJoinNode extends JoinNodeBase {
         this.otherJoinConjuncts = otherJoinConjuncts;
     }
 
+    public void setMarkJoinConjuncts(List markJoinConjuncts) {
+        this.markJoinConjuncts = markJoinConjuncts;
+    }
+
     public List getOtherJoinConjuncts() {
         return otherJoinConjuncts;
     }
 
+    public List getMarkJoinConjuncts() {
+        return markJoinConjuncts;
+    }
+
     SlotRef getMappedInputSlotRef(SlotRef slotRef) {
         if (outputSmap != null) {
             Expr mappedExpr = outputSmap.mappingForRhsExpr(slotRef);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java
index 05eb34a781..63cc724001 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/NestedLoopJoinNode.java
@@ -65,6 +65,8 @@ public class NestedLoopJoinNode extends JoinNodeBase {
     private List runtimeFilterExpr = Lists.newArrayList();
     private List joinConjuncts;
 
+    private List markJoinConjuncts;
+
     public NestedLoopJoinNode(PlanNodeId id, PlanNode outer, PlanNode inner, TableRef innerRef) {
         super(id, "NESTED LOOP JOIN", StatisticalType.NESTED_LOOP_JOIN_NODE, outer, inner, innerRef);
         tupleIds.addAll(outer.getOutputTupleIds());
@@ -81,6 +83,10 @@ public class NestedLoopJoinNode extends JoinNodeBase {
         this.joinConjuncts = joinConjuncts;
     }
 
+    public void setMarkJoinConjuncts(List markJoinConjuncts) {
+        this.markJoinConjuncts = markJoinConjuncts;
+    }
+
     @Override
     protected List computeSlotIdsForJoinConjuncts(Analyzer analyzer) {
         // conjunct
@@ -171,6 +177,12 @@ public class NestedLoopJoinNode extends JoinNodeBase {
         for (Expr conjunct : joinConjuncts) {
             msg.nested_loop_join_node.addToJoinConjuncts(conjunct.treeToThrift());
         }
+        if (markJoinConjuncts != null) {
+            for (Expr conjunct : markJoinConjuncts) {
+                msg.nested_loop_join_node.addToMarkJoinConjuncts(conjunct.treeToThrift());
+            }
+        }
+
         msg.nested_loop_join_node.setIsMark(isMarkJoin());
         if (vSrcToOutputSMap != null) {
             for (int i = 0; i < vSrcToOutputSMap.size(); i++) {
@@ -230,6 +242,11 @@ public class NestedLoopJoinNode extends JoinNodeBase {
             output.append(detailPrefix).append("join conjuncts: ").append(getExplainString(joinConjuncts)).append("\n");
         }
 
+        if (markJoinConjuncts != null && !markJoinConjuncts.isEmpty()) {
+            output.append(detailPrefix).append("mark join predicates: ")
+                    .append(getExplainString(markJoinConjuncts)).append("\n");
+        }
+
         if (!conjuncts.isEmpty()) {
             output.append(detailPrefix).append("predicates: ").append(getExplainString(conjuncts)).append("\n");
         }
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/rewrite/ExistsApplyToJoinTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/rewrite/ExistsApplyToJoinTest.java
index eeb7a505f8..5f617eaacc 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/rewrite/ExistsApplyToJoinTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/rules/rewrite/ExistsApplyToJoinTest.java
@@ -48,7 +48,7 @@ class ExistsApplyToJoinTest implements MemoPatternMatchSupported {
         LogicalApply apply =
                 new LogicalApply<>(ImmutableList.of(leftSlots.get(0), rightSlots.get(0)),
                         exists, Optional.of(equalTo), Optional.empty(),
-                        false, false, left, right);
+                        false, false, false, left, right);
         PlanChecker.from(MemoTestUtils.createConnectContext(), apply)
                 .applyTopDown(new ExistsApplyToJoin())
                 .matchesFromRoot(logicalJoin(
@@ -68,7 +68,7 @@ class ExistsApplyToJoinTest implements MemoPatternMatchSupported {
         LogicalApply apply =
                 new LogicalApply<>(Collections.emptyList(),
                         exists, Optional.of(equalTo), Optional.empty(),
-                        false, false, left, right);
+                        false, false, false, left, right);
         PlanChecker.from(MemoTestUtils.createConnectContext(), apply)
                 .applyTopDown(new ExistsApplyToJoin())
                 .matchesFromRoot(logicalJoin(
@@ -88,7 +88,7 @@ class ExistsApplyToJoinTest implements MemoPatternMatchSupported {
         LogicalApply apply =
                 new LogicalApply<>(Collections.emptyList(),
                         exists, Optional.of(equalTo), Optional.empty(),
-                        false, false, left, right);
+                        false, false, false, left, right);
         PlanChecker.from(MemoTestUtils.createConnectContext(), apply)
                 .applyTopDown(new ExistsApplyToJoin())
                 .matchesFromRoot(logicalFilter(logicalJoin(
@@ -109,7 +109,7 @@ class ExistsApplyToJoinTest implements MemoPatternMatchSupported {
         LogicalApply apply =
                 new LogicalApply<>(ImmutableList.of(leftSlots.get(0), rightSlots.get(0)),
                         exists, Optional.of(equalTo), Optional.empty(),
-                        false, false, left, right);
+                        false, false, false, left, right);
         PlanChecker.from(MemoTestUtils.createConnectContext(), apply)
                 .applyTopDown(new ExistsApplyToJoin())
                 .matchesFromRoot(logicalJoin(
diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/PlanToStringTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/PlanToStringTest.java
index a764b9ec34..229b90f00e 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/PlanToStringTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/trees/plans/PlanToStringTest.java
@@ -75,7 +75,7 @@ public class PlanToStringTest {
                 left, right);
         System.out.println(plan.toString());
         Assertions.assertTrue(plan.toString().matches(
-                "LogicalJoin\\[\\d+\\] \\( type=INNER_JOIN, markJoinSlotReference=Optional.empty, hashJoinConjuncts=\\[\\(a#\\d+ = b#\\d+\\)], otherJoinConjuncts=\\[] \\)"));
+                "LogicalJoin\\[\\d+\\] \\( type=INNER_JOIN, markJoinSlotReference=Optional.empty, hashJoinConjuncts=\\[\\(a#\\d+ = b#\\d+\\)], otherJoinConjuncts=\\[], markJoinConjuncts=\\[] \\)"));
     }
 
     @Test
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index 2f45f355b1..192cd56c8a 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -729,7 +729,8 @@ enum TJoinOp {
   // on the build side. Those NULLs are considered candidate matches, and therefore could
   // be rejected (ANTI-join), based on the other join conjuncts. This is in contrast
   // to LEFT_ANTI_JOIN where NULLs are not matches and therefore always returned.
-  NULL_AWARE_LEFT_ANTI_JOIN
+  NULL_AWARE_LEFT_ANTI_JOIN,
+  NULL_AWARE_LEFT_SEMI_JOIN
 }
 
 enum TJoinDistributionType {
@@ -772,6 +773,7 @@ struct THashJoinNode {
 
   11: optional bool is_mark
   12: optional TJoinDistributionType dist_type
+  13: optional list mark_join_conjuncts
 }
 
 struct TNestedLoopJoinNode {
@@ -791,6 +793,8 @@ struct TNestedLoopJoinNode {
   7: optional bool is_mark
 
   8: optional list join_conjuncts
+
+  9: optional list mark_join_conjuncts
 }
 
 struct TMergeJoinNode {
diff --git a/regression-test/data/nereids_p0/subquery/subquery_unnesting.out b/regression-test/data/nereids_p0/subquery/subquery_unnesting.out
new file mode 100644
index 0000000000..f809cbf8bc
--- /dev/null
+++ b/regression-test/data/nereids_p0/subquery/subquery_unnesting.out
@@ -0,0 +1,1494 @@
+-- This file is automatically generated. You should know what you did if you want to edit this
+-- !select1 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select2 --
+1	2
+1	3
+2	4
+3	3
+3	4
+
+-- !select3 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select4 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select5 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select6 --
+2	4
+3	4
+
+-- !select7 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select8 --
+2	4
+2	5
+3	3
+3	4
+
+-- !select9 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select10 --
+2	4
+2	5
+3	3
+3	4
+
+-- !select11 --
+\N	\N
+\N	1
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select12 --
+\N	\N
+\N	1
+1	\N
+1	2
+1	3
+2	5
+3	3
+20	2
+22	3
+24	4
+
+-- !select13 --
+\N	\N
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select14 --
+\N	\N
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	4
+20	2
+22	3
+24	4
+
+-- !select15 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select16 --
+1	\N
+1	2
+1	3
+20	2
+22	3
+24	4
+
+-- !select17 --
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select18 --
+\N	2
+1	2
+1	3
+2	4
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select19 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+22	3
+24	4
+
+-- !select20 --
+1	3
+2	4
+2	5
+3	3
+3	4
+22	3
+24	4
+
+-- !select21 --
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select22 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select23 --
+\N	\N
+\N	1
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+
+-- !select24 --
+\N	\N
+\N	1
+1	\N
+2	5
+
+-- !select25 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+
+-- !select26 --
+\N	\N
+1	\N
+
+-- !select27 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select28 --
+
+-- !select29 --
+\N	2
+1	2
+1	3
+2	4
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select30 --
+1	3
+2	4
+2	5
+3	3
+3	4
+22	3
+24	4
+
+-- !select31 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select32 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select33 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select34 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select35 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	2
+1	3
+1	3
+1	3
+2	4
+2	4
+2	5
+3	3
+3	3
+3	3
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select36 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select37 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	2
+1	3
+2	4
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select38 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select39 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	2
+1	3
+1	3
+1	3
+2	4
+2	5
+3	3
+3	3
+3	3
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select40 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select41 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	2
+1	3
+1	3
+1	3
+2	4
+2	4
+2	5
+3	3
+3	3
+3	3
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select42 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+20	2
+22	3
+24	4
+24	4
+
+-- !select43 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+3	4
+20	2
+20	2
+22	3
+24	4
+24	4
+
+-- !select44 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select45 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select46 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select47 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select48 --
+\N	\N
+\N	1
+\N	2
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select49 --
+\N	\N
+\N	1
+\N	2
+\N	2
+1	\N
+1	2
+1	2
+1	3
+1	3
+1	3
+2	4
+2	4
+2	5
+3	3
+3	3
+3	3
+3	4
+3	4
+20	2
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select50 --
+\N	\N
+\N	1
+\N	2
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select51 --
+\N	\N
+\N	1
+\N	2
+\N	2
+1	\N
+1	2
+1	2
+1	3
+1	3
+1	3
+2	4
+2	4
+2	5
+3	3
+3	3
+3	3
+3	4
+3	4
+20	2
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select52 --
+\N	\N
+\N	1
+\N	2
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select53 --
+\N	\N
+\N	1
+\N	2
+\N	2
+1	\N
+1	2
+1	2
+1	3
+1	3
+1	3
+2	4
+2	4
+2	5
+3	3
+3	3
+3	3
+3	4
+3	4
+20	2
+20	2
+22	3
+22	3
+22	3
+24	4
+24	4
+
+-- !select54 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select55 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select56 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select57 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
+-- !select58 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	\N
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	2
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+1	3
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	4
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+2	5
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	3
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+3	4
+20	2
+22	3
+24	4
+
+-- !select59 --
+\N	\N
+\N	1
+\N	2
+1	\N
+1	2
+1	3
+2	4
+2	5
+3	3
+3	4
+20	2
+22	3
+24	4
+
diff --git a/regression-test/data/nereids_p0/subquery/test_subquery_in_project.out b/regression-test/data/nereids_p0/subquery/test_subquery_in_project.out
index c0d289b51f..961b0d73c5 100644
--- a/regression-test/data/nereids_p0/subquery/test_subquery_in_project.out
+++ b/regression-test/data/nereids_p0/subquery/test_subquery_in_project.out
@@ -66,3 +66,123 @@ true
 5
 7
 
+-- !select_m1 --
+\N
+false
+false
+false
+false
+false
+false
+false
+false
+false
+true
+true
+true
+
+-- !select_m2 --
+false
+false
+false
+false
+false
+false
+true
+true
+true
+true
+true
+true
+true
+
+-- !select_m3 --
+\N
+false
+false
+false
+true
+true
+true
+true
+true
+true
+true
+true
+true
+
+-- !select_m4 --
+false
+false
+false
+false
+false
+false
+false
+true
+true
+true
+true
+true
+true
+
+-- !select_m5 --
+\N
+\N
+\N
+\N
+true
+true
+true
+true
+true
+true
+true
+true
+true
+
+-- !select_m6 --
+true
+true
+true
+true
+true
+true
+true
+true
+true
+true
+true
+true
+true
+
+-- !select_m7 --
+\N
+\N
+\N
+\N
+false
+false
+false
+false
+false
+false
+false
+false
+false
+
+-- !select_m8 --
+false
+false
+false
+false
+false
+false
+false
+false
+false
+false
+false
+false
+false
+
diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query10.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query10.out
index 59e87830c0..83741371af 100644
--- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query10.out
+++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query10.out
@@ -5,50 +5,53 @@ PhysicalResultSink
 ----PhysicalTopN[MERGE_SORT]
 ------PhysicalDistribute[DistributionSpecGather]
 --------PhysicalTopN[LOCAL_SORT]
-----------hashAgg[LOCAL]
-------------PhysicalProject
---------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
-----------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
-------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
---------------------PhysicalProject
-----------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 c_current_cdemo_sk->[cd_demo_sk]
-------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF5
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------hashAgg[GLOBAL]
+------------PhysicalDistribute[DistributionSpecHash]
+--------------hashAgg[LOCAL]
+----------------PhysicalProject
+------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
+--------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
+----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
-----------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF4 c_customer_sk->[ss_customer_sk]
-------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 c_current_cdemo_sk->[cd_demo_sk]
+------------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF5
+------------------------------PhysicalDistribute[DistributionSpecReplicated]
 --------------------------------PhysicalProject
-----------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF3 d_date_sk->[ss_sold_date_sk]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF3 RF4
-------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF4 c_customer_sk->[ss_customer_sk]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------------------PhysicalProject
-----------------------------------------filter((date_dim.d_moy <= 6) and (date_dim.d_moy >= 3) and (date_dim.d_year = 2001))
-------------------------------------------PhysicalOlapScan[date_dim]
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[c_current_addr_sk]
-----------------------------------PhysicalProject
-------------------------------------PhysicalOlapScan[customer] apply RFs: RF2
-----------------------------------PhysicalDistribute[DistributionSpecReplicated]
-------------------------------------PhysicalProject
---------------------------------------filter(ca_county IN ('Campbell County', 'Cleburne County', 'Escambia County', 'Fairfield County', 'Washtenaw County'))
-----------------------------------------PhysicalOlapScan[customer_address]
---------------------PhysicalDistribute[DistributionSpecReplicated]
-----------------------PhysicalProject
-------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+----------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF3 d_date_sk->[ss_sold_date_sk]
+------------------------------------------PhysicalProject
+--------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF3 RF4
+------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------------------PhysicalProject
+----------------------------------------------filter((date_dim.d_moy <= 6) and (date_dim.d_moy >= 3) and (date_dim.d_year = 2001))
+------------------------------------------------PhysicalOlapScan[date_dim]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[c_current_addr_sk]
+----------------------------------------PhysicalProject
+------------------------------------------PhysicalOlapScan[customer] apply RFs: RF2
+----------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------------------PhysicalProject
+--------------------------------------------filter(ca_county IN ('Campbell County', 'Cleburne County', 'Escambia County', 'Fairfield County', 'Washtenaw County'))
+----------------------------------------------PhysicalOlapScan[customer_address]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
-----------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
---------------------------PhysicalDistribute[DistributionSpecReplicated]
-----------------------------PhysicalProject
-------------------------------filter((date_dim.d_moy <= 6) and (date_dim.d_moy >= 3) and (date_dim.d_year = 2001))
---------------------------------PhysicalOlapScan[date_dim]
-------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------PhysicalProject
-----------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
+----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+------------------------------PhysicalProject
+--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
+------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------PhysicalProject
+----------------------------------filter((date_dim.d_moy <= 6) and (date_dim.d_moy >= 3) and (date_dim.d_year = 2001))
+------------------------------------PhysicalOlapScan[date_dim]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
---------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
-------------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------------PhysicalProject
-----------------------------filter((date_dim.d_moy <= 6) and (date_dim.d_moy >= 3) and (date_dim.d_year = 2001))
-------------------------------PhysicalOlapScan[date_dim]
+--------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
+----------------------------PhysicalProject
+------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
+----------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------PhysicalProject
+--------------------------------filter((date_dim.d_moy <= 6) and (date_dim.d_moy >= 3) and (date_dim.d_year = 2001))
+----------------------------------PhysicalOlapScan[date_dim]
 
diff --git a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query35.out b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query35.out
index 79412a9b42..a72d3f8673 100644
--- a/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query35.out
+++ b/regression-test/data/nereids_tpcds_shape_sf1000_p0/shape/query35.out
@@ -12,32 +12,33 @@ PhysicalResultSink
 ------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
 --------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
 ----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
-------------------------PhysicalProject
---------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 cd_demo_sk->[c_current_cdemo_sk]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF4 ca_address_sk->[c_current_addr_sk]
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
---------------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------------PhysicalProject
-------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
---------------------------------------------PhysicalProject
-----------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
---------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------PhysicalProject
+----------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 cd_demo_sk->[c_current_cdemo_sk]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF4 ca_address_sk->[c_current_addr_sk]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
+----------------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------------PhysicalProject
+--------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
 ----------------------------------------------PhysicalProject
-------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
---------------------------------------------------PhysicalOlapScan[date_dim]
---------------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[customer_address]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------PhysicalOlapScan[customer_demographics]
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
+----------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------------------------PhysicalProject
+--------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
+----------------------------------------------------PhysicalOlapScan[date_dim]
+----------------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------------PhysicalProject
+--------------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------PhysicalProject
+----------------------------------------PhysicalOlapScan[customer_address]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------PhysicalOlapScan[customer_demographics]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
 ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
 ------------------------------PhysicalProject
@@ -46,7 +47,7 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 1999))
 ------------------------------------PhysicalOlapScan[date_dim]
-----------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
 --------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
 ----------------------------PhysicalProject
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query10.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query10.out
index 49d24a004d..845705681d 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query10.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query10.out
@@ -12,30 +12,31 @@ PhysicalResultSink
 ------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
 --------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
 ----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
-------------------------PhysicalProject
---------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF5 ca_address_sk->[c_current_addr_sk]
-----------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=()
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
-----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
-----------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------PhysicalProject
+----------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF5 ca_address_sk->[c_current_addr_sk]
+------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=()
+--------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------PhysicalProject
+----------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
 ------------------------------------------PhysicalProject
---------------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
-----------------------------------------------PhysicalOlapScan[date_dim]
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[customer] apply RFs: RF5
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------PhysicalOlapScan[customer_demographics]
-----------------------------PhysicalDistribute[DistributionSpecReplicated]
-------------------------------PhysicalProject
---------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
-----------------------------------PhysicalOlapScan[customer_address]
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
+------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------------------PhysicalProject
+----------------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+------------------------------------------------PhysicalOlapScan[date_dim]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------PhysicalProject
+----------------------------------------PhysicalOlapScan[customer] apply RFs: RF5
+--------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------PhysicalOlapScan[customer_demographics]
+------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------PhysicalProject
+----------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
+------------------------------------PhysicalOlapScan[customer_address]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
 ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
 ------------------------------PhysicalProject
@@ -44,7 +45,7 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
 ------------------------------------PhysicalOlapScan[date_dim]
-----------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
 --------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
 ----------------------------PhysicalProject
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query35.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query35.out
index a849eb675a..97a42e0930 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query35.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/noStatsRfPrune/query35.out
@@ -12,31 +12,32 @@ PhysicalResultSink
 ------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
 --------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
 ----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
-------------------------PhysicalProject
---------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=()
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=()
---------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=()
-------------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------------PhysicalProject
-----------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
-------------------------------------------PhysicalProject
---------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2
-------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------PhysicalProject
+----------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=()
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=()
+----------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=()
+--------------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------------PhysicalProject
+------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
 --------------------------------------------PhysicalProject
-----------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
-------------------------------------------------PhysicalOlapScan[date_dim]
-------------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------------PhysicalProject
-----------------------------------------PhysicalOlapScan[customer]
---------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------PhysicalProject
-------------------------------------PhysicalOlapScan[customer_demographics]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------PhysicalOlapScan[customer_address]
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2
+--------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------------------------------PhysicalProject
+------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
+--------------------------------------------------PhysicalOlapScan[date_dim]
+--------------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------------PhysicalProject
+------------------------------------------PhysicalOlapScan[customer]
+----------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------PhysicalProject
+--------------------------------------PhysicalOlapScan[customer_demographics]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------PhysicalOlapScan[customer_address]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
 ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
 ------------------------------PhysicalProject
@@ -45,7 +46,7 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
 ------------------------------------PhysicalOlapScan[date_dim]
-----------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
 --------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
 ----------------------------PhysicalProject
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query10.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query10.out
index 9e185b3a54..558e86d696 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query10.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query10.out
@@ -12,30 +12,31 @@ PhysicalResultSink
 ------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
 --------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
 ----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
-------------------------PhysicalProject
---------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF5 ca_address_sk->[c_current_addr_sk]
-----------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF4 cd_demo_sk->[c_current_cdemo_sk]
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------PhysicalProject
---------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
-----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
-----------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------PhysicalProject
+----------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF5 ca_address_sk->[c_current_addr_sk]
+------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF4 cd_demo_sk->[c_current_cdemo_sk]
+--------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------PhysicalProject
+----------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
 ------------------------------------------PhysicalProject
---------------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
-----------------------------------------------PhysicalOlapScan[date_dim]
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------PhysicalOlapScan[customer_demographics]
-----------------------------PhysicalDistribute[DistributionSpecReplicated]
-------------------------------PhysicalProject
---------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
-----------------------------------PhysicalOlapScan[customer_address]
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
+------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------------------PhysicalProject
+----------------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+------------------------------------------------PhysicalOlapScan[date_dim]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------PhysicalProject
+----------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
+--------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------PhysicalOlapScan[customer_demographics]
+------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------PhysicalProject
+----------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
+------------------------------------PhysicalOlapScan[customer_address]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
 ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
 ------------------------------PhysicalProject
@@ -44,7 +45,7 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
 ------------------------------------PhysicalOlapScan[date_dim]
-----------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
 --------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
 ----------------------------PhysicalProject
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query35.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query35.out
index 03410c1b28..cc92a34b9e 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query35.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/no_stats_shape/query35.out
@@ -12,31 +12,32 @@ PhysicalResultSink
 ------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
 --------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
 ----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
-------------------------PhysicalProject
---------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF5 ca_address_sk->[c_current_addr_sk]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF4 cd_demo_sk->[c_current_cdemo_sk]
---------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
-------------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------------PhysicalProject
-----------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
-------------------------------------------PhysicalProject
---------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
-------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------PhysicalProject
+----------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF5 ca_address_sk->[c_current_addr_sk]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF4 cd_demo_sk->[c_current_cdemo_sk]
+----------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
+--------------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------------PhysicalProject
+------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
 --------------------------------------------PhysicalProject
-----------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
-------------------------------------------------PhysicalOlapScan[date_dim]
-------------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------------PhysicalProject
-----------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
---------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------PhysicalProject
-------------------------------------PhysicalOlapScan[customer_demographics]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------PhysicalOlapScan[customer_address]
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
+--------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------------------------------PhysicalProject
+------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
+--------------------------------------------------PhysicalOlapScan[date_dim]
+--------------------------------------PhysicalDistribute[DistributionSpecHash]
+----------------------------------------PhysicalProject
+------------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
+----------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------PhysicalProject
+--------------------------------------PhysicalOlapScan[customer_demographics]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------PhysicalOlapScan[customer_address]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
 ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
 ------------------------------PhysicalProject
@@ -45,7 +46,7 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
 ------------------------------------PhysicalOlapScan[date_dim]
-----------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
 --------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
 ----------------------------PhysicalProject
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query10.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query10.out
index dc3529bdf2..923159c5f6 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query10.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query10.out
@@ -5,50 +5,51 @@ PhysicalResultSink
 ----PhysicalTopN[MERGE_SORT]
 ------PhysicalDistribute[DistributionSpecGather]
 --------PhysicalTopN[LOCAL_SORT]
-----------hashAgg[LOCAL]
-------------PhysicalProject
---------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
-----------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
-------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
---------------------PhysicalProject
-----------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 c_current_cdemo_sk->[cd_demo_sk]
-------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF5
-------------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------------PhysicalProject
-----------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF4 c_customer_sk->[ss_customer_sk]
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------PhysicalProject
-----------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF3 d_date_sk->[ss_sold_date_sk]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF3 RF4
-------------------------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------------------------PhysicalProject
-----------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
-------------------------------------------PhysicalOlapScan[date_dim]
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[c_current_addr_sk]
+----------hashAgg[GLOBAL]
+------------PhysicalDistribute[DistributionSpecHash]
+--------------hashAgg[LOCAL]
+----------------PhysicalProject
+------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
+--------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
+----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
+------------------------PhysicalProject
+--------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF5 c_customer_sk->[ss_customer_sk]
+----------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------PhysicalProject
+--------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF4 d_date_sk->[ss_sold_date_sk]
 ----------------------------------PhysicalProject
-------------------------------------PhysicalOlapScan[customer] apply RFs: RF2
+------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF4 RF5
 ----------------------------------PhysicalDistribute[DistributionSpecReplicated]
 ------------------------------------PhysicalProject
---------------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
-----------------------------------------PhysicalOlapScan[customer_address]
---------------------PhysicalDistribute[DistributionSpecReplicated]
-----------------------PhysicalProject
-------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+--------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+----------------------------------------PhysicalOlapScan[date_dim]
+----------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF3 c_current_cdemo_sk->[cd_demo_sk]
+--------------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF3
+--------------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[c_current_addr_sk]
+------------------------------------PhysicalProject
+--------------------------------------PhysicalOlapScan[customer] apply RFs: RF2
+------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------------PhysicalProject
+----------------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
+------------------------------------------PhysicalOlapScan[customer_address]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
-----------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
---------------------------PhysicalDistribute[DistributionSpecReplicated]
-----------------------------PhysicalProject
-------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
---------------------------------PhysicalOlapScan[date_dim]
-------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------PhysicalProject
-----------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
+----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+------------------------------PhysicalProject
+--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
+------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------PhysicalProject
+----------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+------------------------------------PhysicalOlapScan[date_dim]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
---------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
-------------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------------PhysicalProject
-----------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
-------------------------------PhysicalOlapScan[date_dim]
+--------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
+----------------------------PhysicalProject
+------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
+----------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------PhysicalProject
+--------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+----------------------------------PhysicalOlapScan[date_dim]
 
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query35.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query35.out
index 7f0a43176d..1ea14729a9 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query35.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/rf_prune/query35.out
@@ -12,32 +12,33 @@ PhysicalResultSink
 ------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
 --------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
 ----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
-------------------------PhysicalProject
---------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=()
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=()
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=()
---------------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------------PhysicalProject
-------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
---------------------------------------------PhysicalProject
-----------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2
---------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------PhysicalProject
+----------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=()
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=()
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=()
+----------------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------------PhysicalProject
+--------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
 ----------------------------------------------PhysicalProject
-------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
---------------------------------------------------PhysicalOlapScan[date_dim]
---------------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[customer]
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[customer_address]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------PhysicalOlapScan[customer_demographics]
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2
+----------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------------------------PhysicalProject
+--------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
+----------------------------------------------------PhysicalOlapScan[date_dim]
+----------------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------------PhysicalProject
+--------------------------------------------PhysicalOlapScan[customer]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------PhysicalProject
+----------------------------------------PhysicalOlapScan[customer_address]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------PhysicalOlapScan[customer_demographics]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
 ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
 ------------------------------PhysicalProject
@@ -46,7 +47,7 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
 ------------------------------------PhysicalOlapScan[date_dim]
-----------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
 --------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
 ----------------------------PhysicalProject
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query10.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query10.out
index dc3529bdf2..923159c5f6 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query10.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query10.out
@@ -5,50 +5,51 @@ PhysicalResultSink
 ----PhysicalTopN[MERGE_SORT]
 ------PhysicalDistribute[DistributionSpecGather]
 --------PhysicalTopN[LOCAL_SORT]
-----------hashAgg[LOCAL]
-------------PhysicalProject
---------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
-----------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
-------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
---------------------PhysicalProject
-----------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 c_current_cdemo_sk->[cd_demo_sk]
-------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF5
-------------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------------PhysicalProject
-----------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF4 c_customer_sk->[ss_customer_sk]
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------PhysicalProject
-----------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF3 d_date_sk->[ss_sold_date_sk]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF3 RF4
-------------------------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------------------------PhysicalProject
-----------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
-------------------------------------------PhysicalOlapScan[date_dim]
-------------------------------PhysicalDistribute[DistributionSpecHash]
---------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[c_current_addr_sk]
+----------hashAgg[GLOBAL]
+------------PhysicalDistribute[DistributionSpecHash]
+--------------hashAgg[LOCAL]
+----------------PhysicalProject
+------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
+--------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
+----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
+------------------------PhysicalProject
+--------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF5 c_customer_sk->[ss_customer_sk]
+----------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------PhysicalProject
+--------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF4 d_date_sk->[ss_sold_date_sk]
 ----------------------------------PhysicalProject
-------------------------------------PhysicalOlapScan[customer] apply RFs: RF2
+------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF4 RF5
 ----------------------------------PhysicalDistribute[DistributionSpecReplicated]
 ------------------------------------PhysicalProject
---------------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
-----------------------------------------PhysicalOlapScan[customer_address]
---------------------PhysicalDistribute[DistributionSpecReplicated]
-----------------------PhysicalProject
-------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+--------------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+----------------------------------------PhysicalOlapScan[date_dim]
+----------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF3 c_current_cdemo_sk->[cd_demo_sk]
+--------------------------------PhysicalOlapScan[customer_demographics] apply RFs: RF3
+--------------------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF2 ca_address_sk->[c_current_addr_sk]
+------------------------------------PhysicalProject
+--------------------------------------PhysicalOlapScan[customer] apply RFs: RF2
+------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------------PhysicalProject
+----------------------------------------filter(ca_county IN ('Cochran County', 'Kandiyohi County', 'Marquette County', 'Storey County', 'Warren County'))
+------------------------------------------PhysicalOlapScan[customer_address]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
-----------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
---------------------------PhysicalDistribute[DistributionSpecReplicated]
-----------------------------PhysicalProject
-------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
---------------------------------PhysicalOlapScan[date_dim]
-------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------PhysicalProject
-----------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
+----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
+------------------------------PhysicalProject
+--------------------------------PhysicalOlapScan[web_sales] apply RFs: RF1
+------------------------------PhysicalDistribute[DistributionSpecReplicated]
+--------------------------------PhysicalProject
+----------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+------------------------------------PhysicalOlapScan[date_dim]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
---------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
-------------------------PhysicalDistribute[DistributionSpecReplicated]
---------------------------PhysicalProject
-----------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
-------------------------------PhysicalOlapScan[date_dim]
+--------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
+----------------------------PhysicalProject
+------------------------------PhysicalOlapScan[catalog_sales] apply RFs: RF0
+----------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------PhysicalProject
+--------------------------------filter((date_dim.d_moy <= 4) and (date_dim.d_moy >= 1) and (date_dim.d_year = 2001))
+----------------------------------PhysicalOlapScan[date_dim]
 
diff --git a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query35.out b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query35.out
index 35d0777a6f..0874f2dd35 100644
--- a/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query35.out
+++ b/regression-test/data/nereids_tpcds_shape_sf100_p0/shape/query35.out
@@ -12,32 +12,33 @@ PhysicalResultSink
 ------------------filter((ifnull($c$1, FALSE) OR ifnull($c$2, FALSE)))
 --------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = catalog_sales.cs_ship_customer_sk)) otherCondition=()
 ----------------------hashJoin[LEFT_SEMI_JOIN] hashCondition=((c.c_customer_sk = web_sales.ws_bill_customer_sk)) otherCondition=()
-------------------------PhysicalProject
---------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 cd_demo_sk->[c_current_cdemo_sk]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF4 ca_address_sk->[c_current_addr_sk]
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
---------------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------------PhysicalProject
-------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
---------------------------------------------PhysicalProject
-----------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
---------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------PhysicalProject
+----------------------------hashJoin[INNER_JOIN] hashCondition=((customer_demographics.cd_demo_sk = c.c_current_cdemo_sk)) otherCondition=() build RFs:RF5 cd_demo_sk->[c_current_cdemo_sk]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------hashJoin[INNER_JOIN] hashCondition=((c.c_current_addr_sk = ca.ca_address_sk)) otherCondition=() build RFs:RF4 ca_address_sk->[c_current_addr_sk]
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------hashJoin[RIGHT_SEMI_JOIN] hashCondition=((c.c_customer_sk = store_sales.ss_customer_sk)) otherCondition=() build RFs:RF3 c_customer_sk->[ss_customer_sk]
+----------------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------------PhysicalProject
+--------------------------------------------hashJoin[INNER_JOIN] hashCondition=((store_sales.ss_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF2 d_date_sk->[ss_sold_date_sk]
 ----------------------------------------------PhysicalProject
-------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
---------------------------------------------------PhysicalOlapScan[date_dim]
---------------------------------------PhysicalDistribute[DistributionSpecHash]
-----------------------------------------PhysicalProject
-------------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
-----------------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------------PhysicalProject
---------------------------------------PhysicalOlapScan[customer_address]
-----------------------------PhysicalDistribute[DistributionSpecHash]
-------------------------------PhysicalProject
---------------------------------PhysicalOlapScan[customer_demographics]
-------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------------------------PhysicalOlapScan[store_sales] apply RFs: RF2 RF3
+----------------------------------------------PhysicalDistribute[DistributionSpecReplicated]
+------------------------------------------------PhysicalProject
+--------------------------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
+----------------------------------------------------PhysicalOlapScan[date_dim]
+----------------------------------------PhysicalDistribute[DistributionSpecHash]
+------------------------------------------PhysicalProject
+--------------------------------------------PhysicalOlapScan[customer] apply RFs: RF4 RF5
+------------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------------PhysicalProject
+----------------------------------------PhysicalOlapScan[customer_address]
+------------------------------PhysicalDistribute[DistributionSpecHash]
+--------------------------------PhysicalProject
+----------------------------------PhysicalOlapScan[customer_demographics]
+------------------------PhysicalDistribute[DistributionSpecHash]
 --------------------------PhysicalProject
 ----------------------------hashJoin[INNER_JOIN] hashCondition=((web_sales.ws_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF1 d_date_sk->[ws_sold_date_sk]
 ------------------------------PhysicalProject
@@ -46,7 +47,7 @@ PhysicalResultSink
 --------------------------------PhysicalProject
 ----------------------------------filter((date_dim.d_qoy < 4) and (date_dim.d_year = 2001))
 ------------------------------------PhysicalOlapScan[date_dim]
-----------------------PhysicalDistribute[DistributionSpecReplicated]
+----------------------PhysicalDistribute[DistributionSpecHash]
 ------------------------PhysicalProject
 --------------------------hashJoin[INNER_JOIN] hashCondition=((catalog_sales.cs_sold_date_sk = date_dim.d_date_sk)) otherCondition=() build RFs:RF0 d_date_sk->[cs_sold_date_sk]
 ----------------------------PhysicalProject
diff --git a/regression-test/suites/nereids_p0/subquery/subquery_unnesting.groovy b/regression-test/suites/nereids_p0/subquery/subquery_unnesting.groovy
new file mode 100644
index 0000000000..0a830f1a36
--- /dev/null
+++ b/regression-test/suites/nereids_p0/subquery/subquery_unnesting.groovy
@@ -0,0 +1,132 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite ("subquery_unnesting") {
+    sql """ SET enable_nereids_planner=true"""
+    sql "set enable_fallback_to_original_planner=false"
+
+    sql "drop table if exists t1"
+    sql "drop table if exists t2"
+    sql "drop table if exists t3"
+    
+    sql """create table t1
+                    (k1 bigint, k2 bigint)
+                    ENGINE=OLAP
+            DUPLICATE KEY(k1, k2)
+            COMMENT 'OLAP'
+            DISTRIBUTED BY HASH(k2) BUCKETS 1
+            PROPERTIES (
+            "replication_num" = "1"
+            );"""
+    sql """create table t2
+                    (k1 int, k2 varchar(128), k3 bigint, v1 bigint, v2 bigint)
+                    ENGINE=OLAP
+            DUPLICATE KEY(k1, k2)
+            COMMENT 'OLAP'
+            DISTRIBUTED BY HASH(k2) BUCKETS 1
+            PROPERTIES (
+            "replication_num" = "1"
+            );"""
+    sql """create table t3
+                    (k1 bigint, k2 bigint)
+                    ENGINE=OLAP
+            DUPLICATE KEY(k1, k2)
+            COMMENT 'OLAP'
+            DISTRIBUTED BY HASH(k2) BUCKETS 1
+            PROPERTIES (
+            "replication_num" = "1"
+            );"""
+    sql """insert into t1 values (1,null),(null,1),(1,2), (null,2),(1,3), (2,4), (2,5), (3,3), (3,4), (20,2), (22,3), (24,4),(null,null);"""
+    sql """insert into t2 values (1,'abc',2,3,4), (1,'abcd',3,3,4), (2,'xyz',2,4,2), (2,'uvw',3,4,2), (2,'uvw',3,4,2), (3,'abc',4,5,3), (3,'abc',4,5,3), (null,null,null,null,null);"""
+    sql """insert into t3 values (1,null),(null,1),(1,4), (1,2), (null,3), (2,4), (3,7), (3,9),(null,null),(5,1);"""
+
+    sql "drop table if exists sub_query_correlated_subquery1"
+    sql "drop table if exists sub_query_correlated_subquery3"
+
+    sql """create table if not exists sub_query_correlated_subquery1
+            (k1 bigint, k2 bigint)
+            duplicate key(k1)
+            distributed by hash(k2) buckets 1
+            properties('replication_num' = '1');"""
+    sql """create table if not exists sub_query_correlated_subquery3
+            (k1 int, k2 varchar(128), k3 bigint, v1 bigint, v2 bigint)
+            distributed by hash(k2) buckets 1
+            properties('replication_num' = '1');"""
+    sql """insert into sub_query_correlated_subquery1 values (1,null),(null,1),(1,2), (null,2),(1,3), (2,4), (2,5), (3,3), (3,4), (20,2), (22,3), (24,4),(null,null);"""
+    sql """insert into sub_query_correlated_subquery3 values (1,"abc",2,3,4), (1,"abcd",3,3,4), (2,"xyz",2,4,2), (2,"uvw",3,4,2), (2,"uvw",3,4,2), (3,"abc",4,5,3), (3,"abc",4,5,3), (null,null,null,null,null);"""
+    // Subqueries in WHERE over the sub_query_correlated_* tables: scalar compare (select1-4), IN (5-10), NOT IN (11-16), EXISTS (17-22), NOT EXISTS (23-28), EXISTS with LIMIT 1 (29-31); the odd-numbered queries up to select27 also OR the subquery predicate with "k1 < 10". Every query orders by k1, k2.
+    qt_select1 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 < (select sum(sub_query_correlated_subquery3.k3) from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = sub_query_correlated_subquery1.k2) OR k1 < 10 order by k1, k2;"""
+    qt_select2 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 < (select sum(sub_query_correlated_subquery3.k3) from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = sub_query_correlated_subquery1.k2) order by k1, k2;"""
+    qt_select3 """SELECT * FROM sub_query_correlated_subquery1 WHERE k1 > (SELECT AVG(k1) FROM sub_query_correlated_subquery3) OR k1 < 10 order by k1, k2;"""
+    qt_select4 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 < (select sum(sub_query_correlated_subquery3.k3) from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = 2) order by k1, k2;"""
+    qt_select5 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = sub_query_correlated_subquery1.k2) OR k1 < 10 order by k1, k2;"""
+    qt_select6 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = sub_query_correlated_subquery1.k2) order by k1, k2;"""
+    qt_select7 """SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3 WHERE sub_query_correlated_subquery1.k1 > sub_query_correlated_subquery3.k3) OR k1 < 10 ORDER BY k1, k2;"""
+    qt_select8 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 < sub_query_correlated_subquery1.k2) order by k1, k2;"""
+    qt_select9 """SELECT * FROM sub_query_correlated_subquery1 WHERE k1 IN (SELECT k1 FROM sub_query_correlated_subquery3) OR k1 < 10 order by k1, k2;"""
+    qt_select10 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3) order by k1, k2;"""
+    qt_select11 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 not in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = sub_query_correlated_subquery1.k2) or k1 < 10 order by k1, k2;"""
+    qt_select12 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 not in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = sub_query_correlated_subquery1.k2) order by k1, k2;"""
+    qt_select13 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 not in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 > sub_query_correlated_subquery1.k2) or k1 < 10 order by k1, k2;"""
+    qt_select14 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 not in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 > sub_query_correlated_subquery1.k2) order by k1, k2;"""
+    qt_select15 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 not in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 ) or k1 < 10 order by k1, k2;"""
+    qt_select16 """select * from sub_query_correlated_subquery1 where sub_query_correlated_subquery1.k1 not in (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery3.v2 = 2) order by k1, k2;"""
+    qt_select17 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 = sub_query_correlated_subquery3.v2) or k1 < 10 order by k1, k2;"""
+    qt_select18 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 = sub_query_correlated_subquery3.v2) order by k1, k2;"""
+    qt_select19 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 > sub_query_correlated_subquery3.v2) or k1 < 10 order by k1, k2;"""
+    qt_select20 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 > sub_query_correlated_subquery3.v2) order by k1, k2;"""
+    qt_select21 """SELECT * FROM sub_query_correlated_subquery1 WHERE EXISTS (SELECT k1 FROM sub_query_correlated_subquery3 WHERE k1 = 10) OR k1 < 10 order by k1, k2;"""
+    qt_select22 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3) order by k1, k2;"""
+    qt_select23 """select * from sub_query_correlated_subquery1 where not exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 = sub_query_correlated_subquery3.v2) or k1 < 10 order by k1, k2;"""
+    qt_select24 """select * from sub_query_correlated_subquery1 where not exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 = sub_query_correlated_subquery3.v2) order by k1, k2;"""
+    qt_select25 """select * from sub_query_correlated_subquery1 where not exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 > sub_query_correlated_subquery3.v2) or k1 < 10 order by k1, k2;"""
+    qt_select26 """select * from sub_query_correlated_subquery1 where not exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 != sub_query_correlated_subquery3.v2) order by k1, k2;"""
+    qt_select27 """SELECT * FROM sub_query_correlated_subquery1 WHERE not EXISTS (SELECT k1 FROM sub_query_correlated_subquery3 WHERE k1 = 10) OR k1 < 10 order by k1, k2;"""
+    qt_select28 """select * from sub_query_correlated_subquery1 where not exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3) order by k1, k2;"""
+    qt_select29 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 = sub_query_correlated_subquery3.v2 limit 1) order by k1, k2;"""
+    qt_select30 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 where sub_query_correlated_subquery1.k2 > sub_query_correlated_subquery3.v2 limit 1) order by k1, k2;"""
+    qt_select31 """select * from sub_query_correlated_subquery1 where exists (select sub_query_correlated_subquery3.k3 from sub_query_correlated_subquery3 limit 1) order by k1, k2;"""
+    // Subqueries inside a LEFT JOIN ... ON clause over t1/t2/t3: scalar max (select32-35), IN (36-41), NOT IN (42-47), EXISTS (48-53), NOT EXISTS (54-59); the even-numbered queries add an "or t1.k1 < 10" branch, which binds looser than the preceding AND. Only t1 columns are projected.
+    qt_select32 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 < ( select max(k1) from t3 where t1.k2 = t3.k2 ) OR t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select33 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 < ( select max(k1) from t3 where t1.k2 = t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select34 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 < ( select max(k1) from t3 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select35 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 < ( select max(k1) from t3 ) order by t1.k1, t1.k2;"""
+    qt_select36 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 in ( select t3.k1 from t3 where t1.k2 = t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select37 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 in ( select t3.k1 from t3 where t1.k2 = t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select38 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 in ( select t3.k1 from t3 where t1.k2 < t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select39 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 in ( select t3.k1 from t3 where t1.k2 < t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select40 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 in ( select t3.k1 from t3 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select41 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 in ( select t3.k1 from t3 ) order by t1.k1, t1.k2;"""
+    qt_select42 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 not in ( select t3.k1 from t3 where t1.k2 = t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select43 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 not in ( select t3.k1 from t3 where t1.k2 = t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select44 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 not in ( select t3.k1 from t3 where t1.k2 < t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select45 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 not in ( select t3.k1 from t3 where t1.k2 < t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select46 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 not in ( select t3.k1 from t3 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select47 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and t1.k1 not in ( select t3.k1 from t3 ) order by t1.k1, t1.k2;"""
+    qt_select48 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and exists ( select t3.k1 from t3 where t1.k2 = t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select49 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and exists ( select t3.k1 from t3 where t1.k2 = t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select50 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and exists ( select t3.k1 from t3 where t1.k2 < t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select51 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and exists ( select t3.k1 from t3 where t1.k2 < t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select52 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and exists ( select t3.k1 from t3 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select53 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and exists ( select t3.k1 from t3 ) order by t1.k1, t1.k2;"""
+    qt_select54 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and not exists ( select t3.k1 from t3 where t1.k2 = t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select55 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and not exists ( select t3.k1 from t3 where t1.k2 = t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select56 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and not exists ( select t3.k1 from t3 where t1.k2 < t3.k2 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select57 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and not exists ( select t3.k1 from t3 where t1.k2 < t3.k2 ) order by t1.k1, t1.k2;"""
+    qt_select58 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and not exists ( select t3.k1 from t3 ) or t1.k1 < 10 order by t1.k1, t1.k2;"""
+    qt_select59 """select t1.* from t1 left join t2 on t1.k2 = t2.k3 and not exists ( select t3.k1 from t3 ) order by t1.k1, t1.k2;"""
+}
diff --git a/regression-test/suites/nereids_p0/subquery/test_subquery_in_project.groovy b/regression-test/suites/nereids_p0/subquery/test_subquery_in_project.groovy
index 25920848c6..32802dbb7a 100644
--- a/regression-test/suites/nereids_p0/subquery/test_subquery_in_project.groovy
+++ b/regression-test/suites/nereids_p0/subquery/test_subquery_in_project.groovy
@@ -137,4 +137,59 @@ suite("test_subquery_in_project") {
     """
 
     sql """drop table if exists test_sql;"""
+    // Fixtures for the qt_select_m* queries: drop any leftovers first so the unconditional "create table" statements cannot collide.
+    sql """drop table if exists markjoin_t1;"""
+    sql """drop table if exists markjoin_t2;"""
+    sql """drop table if exists markjoin_t3;""" // NOTE(review): markjoin_t3 is never created in the visible part of this suite; presumably defensive cleanup, confirm or remove
+    // Both tables are duplicate-key on (a, b), hashed on a into a single bucket, with replication_num = 1.
+    sql """create table markjoin_t1
+                    (a bigint, b bigint)
+                    ENGINE=OLAP
+            DUPLICATE KEY(a, b)
+            COMMENT 'OLAP'
+            DISTRIBUTED BY HASH(a) BUCKETS 1
+            PROPERTIES (
+            "replication_num" = "1"
+            );"""
+    sql """create table markjoin_t2
+                    (a int, b varchar(128), c bigint, v1 bigint, v2 bigint)
+                    ENGINE=OLAP
+            DUPLICATE KEY(a, b)
+            COMMENT 'OLAP'
+            DISTRIBUTED BY HASH(a) BUCKETS 1
+            PROPERTIES (
+            "replication_num" = "1"
+            );"""
+    // Seed rows: markjoin_t1 has NULL in a, in b, and in both; markjoin_t2 has two duplicated rows and one all-NULL row.
+    sql """insert into markjoin_t1 values (1,null),(null,1),(1,2), (null,2),(1,3), (2,4), (2,5), (3,3), (3,4), (20,2), (22,3), (24,4),(null,null);"""
+    sql """insert into markjoin_t2 values (1,'abc',2,3,4), (1,'abcd',3,3,4), (2,'xyz',2,4,2), (2,'uvw',3,4,2), (2,'uvw',3,4,2), (3,'abc',4,5,3), (3,'abc',4,5,3), (null,null,null,null,null);"""
+    // Subquery predicates used as SELECT-list expressions: m1-m4 are correlated on a (IN, EXISTS, NOT IN, NOT EXISTS); m5-m8 repeat the same four predicates uncorrelated. "order by 1" sorts on the single output column.
+    qt_select_m1 """SELECT markjoin_t1.b IN
+                    (SELECT markjoin_t2.c FROM markjoin_t2 WHERE markjoin_t1.a = markjoin_t2.a) 
+                    FROM markjoin_t1 order by 1;"""
+    qt_select_m2 """SELECT EXISTS 
+                    (SELECT markjoin_t2.c FROM markjoin_t2 WHERE markjoin_t1.a = markjoin_t2.a) 
+                    FROM markjoin_t1 order by 1;"""
+    qt_select_m3 """SELECT  markjoin_t1.b NOT IN
+                    (SELECT  markjoin_t2.c FROM markjoin_t2 WHERE markjoin_t1.a = markjoin_t2.a) 
+                    FROM markjoin_t1 order by 1;"""
+    qt_select_m4 """SELECT NOT EXISTS
+                    (SELECT  markjoin_t2.c FROM markjoin_t2 WHERE markjoin_t1.a = markjoin_t2.a) 
+                    FROM markjoin_t1 order by 1;"""
+    qt_select_m5 """SELECT  markjoin_t1.b IN 
+                    (SELECT  markjoin_t2.c FROM markjoin_t2) 
+                    FROM markjoin_t1 order by 1;"""
+    qt_select_m6 """SELECT EXISTS 
+                    (SELECT markjoin_t2.c FROM markjoin_t2) 
+                    FROM markjoin_t1 order by 1;"""
+    qt_select_m7 """SELECT markjoin_t1.b NOT IN 
+                    (SELECT markjoin_t2.c FROM markjoin_t2) 
+                    FROM markjoin_t1 order by 1;"""
+    qt_select_m8 """SELECT NOT EXISTS 
+                    (SELECT markjoin_t2.c FROM markjoin_t2) 
+                    FROM markjoin_t1 order by 1;"""
+    sql """drop table if exists markjoin_t1;""" // teardown: remove the markjoin_* fixtures once the qt_select_m* queries have run
+    sql """drop table if exists markjoin_t2;"""
+    sql """drop table if exists markjoin_t3;""" // NOTE(review): no markjoin_t3 is created in the visible part of this suite; "if exists" makes this a no-op, confirm it is intentional
+
 }