diff --git a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp index c31e2ee462..d03dd3e49c 100644 --- a/be/src/exec/hash_join_node.cpp +++ b/be/src/exec/hash_join_node.cpp @@ -44,6 +44,12 @@ HashJoinNode::HashJoinNode( _match_all_build = (_join_op == TJoinOp::RIGHT_OUTER_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN); _is_push_down = tnode.hash_join_node.is_push_down; + // Semi/anti joins ask process_build_batch() for a hash table without duplicated rows. + // NOTE(review): RIGHT_SEMI/RIGHT_ANTI presumably return build rows, and other join + // conjuncts are evaluated per matched build row; confirm dropping duplicates is safe. + _build_unique = _join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::RIGHT_ANTI_JOIN + || _join_op == TJoinOp::RIGHT_SEMI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN + || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN; } HashJoinNode::~HashJoinNode() { diff --git a/be/src/exec/hash_join_node.h b/be/src/exec/hash_join_node.h index f4b703b7e8..eaa04879cc 100644 --- a/be/src/exec/hash_join_node.h +++ b/be/src/exec/hash_join_node.h @@ -88,6 +88,7 @@ private: bool _match_all_probe; // output all rows coming from the probe input bool _match_one_build; // match at most one build row to each probe row bool _match_all_build; // output all rows coming from the build input + bool _build_unique; // if true, build a hash table without duplicated rows bool _matched_probe; // if true, we have matched the current probe row bool _eos; // if true, nothing left to return in get_next() diff --git a/be/src/exec/hash_join_node_ir.cpp b/be/src/exec/hash_join_node_ir.cpp index e7fec77962..518eb767fa 100644 --- a/be/src/exec/hash_join_node_ir.cpp +++ b/be/src/exec/hash_join_node_ir.cpp @@ -30,15 +30,14 @@ namespace doris { // we will not be able to replace the funcitons with codegen'd versions. // TODO: explicitly set the calling convention? // TODO: investigate using fastcc for all codegen internal functions? 
-bool IR_NO_INLINE eval_other_join_conjuncts( - ExprContext* const* ctxs, int num_ctxs, TupleRow* row) { +bool IR_NO_INLINE eval_other_join_conjuncts(ExprContext* const* ctxs, int num_ctxs, TupleRow* row) { return ExecNode::eval_conjuncts(ctxs, num_ctxs, row); } // CreateOutputRow, EvalOtherJoinConjuncts, and EvalConjuncts are replaced by // codegen. int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch, - int max_added_rows) { + int max_added_rows) { // This path does not handle full outer or right outer joins DCHECK(!_match_all_build); @@ -63,8 +62,7 @@ int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch _hash_tbl_iterator.next(); create_output_row(out_row, _current_probe_row, matched_build_row); - if (!eval_other_join_conjuncts( - other_conjunct_ctxs, num_other_conjunct_ctxs, out_row)) { + if (!eval_other_join_conjuncts(other_conjunct_ctxs, num_other_conjunct_ctxs, out_row)) { continue; } @@ -72,7 +70,7 @@ int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch // left_anti_join: equal match won't return if (_join_op == TJoinOp::LEFT_ANTI_JOIN) { - _hash_tbl_iterator= _hash_tbl->end(); + _hash_tbl_iterator = _hash_tbl->end(); break; } @@ -95,10 +93,10 @@ int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch break; } } - + // Handle left outer-join and left semi-join - if ((!_matched_probe && _match_all_probe) || - ((!_matched_probe && _join_op == TJoinOp::LEFT_ANTI_JOIN))) { + if ((!_matched_probe && _match_all_probe) || + ((!_matched_probe && _join_op == TJoinOp::LEFT_ANTI_JOIN))) { create_output_row(out_row, _current_probe_row, NULL); _matched_probe = true; @@ -137,11 +135,19 @@ end: return rows_returned; } +// When the build table has many duplicated rows, hash collisions become severe, +// so when _build_unique is set we skip storing duplicates and build a unique hash table instead. void HashJoinNode::process_build_batch(RowBatch* 
build_batch) { // insert build row into our hash table - for (int i = 0; i < build_batch->num_rows(); ++i) { - _hash_tbl->insert(build_batch->get_row(i)); + if (_build_unique) { + for (int i = 0; i < build_batch->num_rows(); ++i) { + _hash_tbl->insert_unique(build_batch->get_row(i)); + } + } else { + for (int i = 0; i < build_batch->num_rows(); ++i) { + _hash_tbl->insert(build_batch->get_row(i)); + } } } -} +} // namespace doris