From f0db9272ddffbc367748ce0f1040bc05dd16f5f1 Mon Sep 17 00:00:00 2001 From: yangzhg <780531911@qq.com> Date: Fri, 20 Mar 2020 10:31:14 +0800 Subject: [PATCH] [Performance] Improve performance of hash join in some cases (#3148) Improve the performance of hash join when the build table has too many duplicated rows; duplicates cause hash table collisions and slow down probe performance. In this PR, when the join type is semi join or anti join, we build a hash table without duplicated rows. benchmark: dataset: tpcds dataset `store_sales` and `catalog_sales` ``` mysql> select count(*) from catalog_sales; +----------+ | count(*) | +----------+ | 14401261 | +----------+ 1 row in set (0.44 sec) mysql> select count(distinct cs_bill_cdemo_sk) from catalog_sales; +------------------------------------+ | count(DISTINCT `cs_bill_cdemo_sk`) | +------------------------------------+ | 1085080 | +------------------------------------+ 1 row in set (2.46 sec) mysql> select count(*) from store_sales; +----------+ | count(*) | +----------+ | 28800991 | +----------+ 1 row in set (0.84 sec) mysql> select count(distinct ss_addr_sk) from store_sales; +------------------------------+ | count(DISTINCT `ss_addr_sk`) | +------------------------------+ | 249978 | +------------------------------+ 1 row in set (2.57 sec) ``` test queries: query1: `select count(*) from (select store_sales.ss_addr_sk from store_sales left semi join catalog_sales on catalog_sales.cs_bill_cdemo_sk = store_sales.ss_addr_sk) a;` query2: `select count(*) from (select catalog_sales.cs_bill_cdemo_sk from catalog_sales left semi join store_sales on catalog_sales.cs_bill_cdemo_sk = store_sales.ss_addr_sk) a;` benchmark result: ||query1|query2| |:--:|:--:|:--:| |before|14.76 sec|3 min 16.52 sec| |after|12.64 sec|10.34 sec| --- be/src/exec/hash_join_node.cpp | 3 +++ be/src/exec/hash_join_node.h | 1 + be/src/exec/hash_join_node_ir.cpp | 30 ++++++++++++++++++------------ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git 
a/be/src/exec/hash_join_node.cpp b/be/src/exec/hash_join_node.cpp
index c31e2ee462..d03dd3e49c 100644
--- a/be/src/exec/hash_join_node.cpp
+++ b/be/src/exec/hash_join_node.cpp
@@ -44,6 +44,9 @@ HashJoinNode::HashJoinNode(
     _match_all_build =
         (_join_op == TJoinOp::RIGHT_OUTER_JOIN || _join_op == TJoinOp::FULL_OUTER_JOIN);
     _is_push_down = tnode.hash_join_node.is_push_down;
+    _build_unique = _join_op == TJoinOp::LEFT_ANTI_JOIN || _join_op == TJoinOp::RIGHT_ANTI_JOIN
+        || _join_op == TJoinOp::RIGHT_SEMI_JOIN || _join_op == TJoinOp::LEFT_SEMI_JOIN
+        || _join_op == TJoinOp::NULL_AWARE_LEFT_ANTI_JOIN;
 }
 
 HashJoinNode::~HashJoinNode() {
diff --git a/be/src/exec/hash_join_node.h b/be/src/exec/hash_join_node.h
index f4b703b7e8..eaa04879cc 100644
--- a/be/src/exec/hash_join_node.h
+++ b/be/src/exec/hash_join_node.h
@@ -88,6 +88,7 @@ private:
     bool _match_all_probe; // output all rows coming from the probe input
     bool _match_one_build; // match at most one build row to each probe row
     bool _match_all_build; // output all rows coming from the build input
+    bool _build_unique; // build a hash table without duplicated rows
     bool _matched_probe; // if true, we have matched the current probe row
     bool _eos; // if true, nothing left to return in get_next()
 
diff --git a/be/src/exec/hash_join_node_ir.cpp b/be/src/exec/hash_join_node_ir.cpp
index e7fec77962..518eb767fa 100644
--- a/be/src/exec/hash_join_node_ir.cpp
+++ b/be/src/exec/hash_join_node_ir.cpp
@@ -30,15 +30,14 @@ namespace doris {
 // we will not be able to replace the funcitons with codegen'd versions.
 // TODO: explicitly set the calling convention?
 // TODO: investigate using fastcc for all codegen internal functions?
-bool IR_NO_INLINE eval_other_join_conjuncts(
-    ExprContext* const* ctxs, int num_ctxs, TupleRow* row) {
+bool IR_NO_INLINE eval_other_join_conjuncts(ExprContext* const* ctxs, int num_ctxs, TupleRow* row) {
     return ExecNode::eval_conjuncts(ctxs, num_ctxs, row);
 }
 
 // CreateOutputRow, EvalOtherJoinConjuncts, and EvalConjuncts are replaced by
 // codegen.
 int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch,
-    int max_added_rows) {
+                                      int max_added_rows) {
     // This path does not handle full outer or right outer joins
     DCHECK(!_match_all_build);
 
@@ -63,8 +62,7 @@ int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch
             _hash_tbl_iterator.next();
             create_output_row(out_row, _current_probe_row, matched_build_row);
 
-            if (!eval_other_join_conjuncts(
-                    other_conjunct_ctxs, num_other_conjunct_ctxs, out_row)) {
+            if (!eval_other_join_conjuncts(other_conjunct_ctxs, num_other_conjunct_ctxs, out_row)) {
                 continue;
             }
 
@@ -72,7 +70,7 @@ int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch
 
             // left_anti_join: equal match won't return
             if (_join_op == TJoinOp::LEFT_ANTI_JOIN) {
-                _hash_tbl_iterator= _hash_tbl->end();
+                _hash_tbl_iterator = _hash_tbl->end();
                 break;
             }
 
@@ -95,10 +93,10 @@ int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch
                 break;
             }
         }
-        
+
         // Handle left outer-join and left semi-join
-        if ((!_matched_probe && _match_all_probe) || 
-                ((!_matched_probe && _join_op == TJoinOp::LEFT_ANTI_JOIN))) {
+        if ((!_matched_probe && _match_all_probe) ||
+            ((!_matched_probe && _join_op == TJoinOp::LEFT_ANTI_JOIN))) {
             create_output_row(out_row, _current_probe_row, NULL);
             _matched_probe = true;
 
@@ -137,11 +135,19 @@ end:
     return rows_returned;
 }
 
+// When the build table has many duplicated rows, hash collisions become severe.
+// If _build_unique is set, duplicated values are not stored and a unique hash table is built.
 void HashJoinNode::process_build_batch(RowBatch* build_batch) {
     // insert build row into our hash table
-    for (int i = 0; i < build_batch->num_rows(); ++i) {
-        _hash_tbl->insert(build_batch->get_row(i));
+    if (_build_unique) {
+        for (int i = 0; i < build_batch->num_rows(); ++i) {
+            _hash_tbl->insert_unique(build_batch->get_row(i));
+        }
+    } else {
+        for (int i = 0; i < build_batch->num_rows(); ++i) {
+            _hash_tbl->insert(build_batch->get_row(i));
+        }
     }
 }
 
-}
+} // namespace doris