Files
doris/be/src/exec/hash_join_node_ir.cpp
2020-10-20 09:28:57 +08:00

154 lines
5.6 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "exec/hash_join_node.h"
#include "exec/hash_table.hpp"
#include "runtime/row_batch.h"
namespace doris {
// Functions in this file are cross compiled to IR with clang.
// Thin forwarder to ExecNode::eval_conjuncts() that exists only to give the join
// conjuncts a function name of their own, so that codegen can tell the join
// conjuncts apart from the non-join conjuncts.
// Note: this must not be declared static. LLVM would pick the fastcc calling
// convention and we would not be able to replace the function with a codegen'd version.
// TODO: explicitly set the calling convention?
// TODO: investigate using fastcc for all codegen internal functions?
bool IR_NO_INLINE eval_other_join_conjuncts(ExprContext* const* ctxs, int num_ctxs, TupleRow* row) {
    const bool row_passes = ExecNode::eval_conjuncts(ctxs, num_ctxs, row);
    return row_passes;
}
// Probes the hash table with rows taken from 'probe_batch' and writes the resulting
// join rows into 'out_batch'.
//
// 'max_added_rows' rows are reserved in 'out_batch' up front; at most that many rows
// are produced. The number of rows actually produced is committed to 'out_batch' and
// returned. The function stops when either the reserved rows are used up or
// 'probe_batch' has no more rows to probe with.
//
// The probe state (_current_probe_row, _probe_batch_pos, _hash_tbl_iterator and
// _matched_probe) is kept in members and is not reset on entry, so the loop below
// first drains whatever matches are still pending in _hash_tbl_iterator.
//
// This path must not be used when unmatched build rows have to be emitted
// (see the DCHECK on _match_all_build).
//
// CreateOutputRow, EvalOtherJoinConjuncts, and EvalConjuncts are replaced by
// codegen.
int HashJoinNode::process_probe_batch(RowBatch* out_batch, RowBatch* probe_batch,
                                      int max_added_rows) {
    // This path does not handle full outer or right outer joins
    DCHECK(!_match_all_build);

    // Reserve the output rows and set up a cursor over them. The cursor is kept both
    // as a byte pointer (for advancing by row_byte_size()) and as a TupleRow*.
    int row_idx = out_batch->add_rows(max_added_rows);
    DCHECK(row_idx != RowBatch::INVALID_ROW_INDEX);
    uint8_t* out_row_mem = reinterpret_cast<uint8_t*>(out_batch->get_row(row_idx));
    TupleRow* out_row = reinterpret_cast<TupleRow*>(out_row_mem);

    int rows_returned = 0;
    int probe_rows = probe_batch->num_rows();

    // Use data() instead of &ctxs[0]: either container may be empty (e.g. a join
    // without additional conjuncts), and indexing element 0 of an empty container is
    // undefined behaviour. The pointers are only handed on together with the
    // matching element count.
    ExprContext* const* other_conjunct_ctxs = _other_join_conjunct_ctxs.data();
    int num_other_conjunct_ctxs = _other_join_conjunct_ctxs.size();
    ExprContext* const* conjunct_ctxs = _conjunct_ctxs.data();
    int num_conjunct_ctxs = _conjunct_ctxs.size();

    while (true) {
        // Create output row for each matching build row
        while (_hash_tbl_iterator.has_next()) {
            TupleRow* matched_build_row = _hash_tbl_iterator.get_row();
            _hash_tbl_iterator.next<true>();
            create_output_row(out_row, _current_probe_row, matched_build_row);

            if (!eval_other_join_conjuncts(other_conjunct_ctxs, num_other_conjunct_ctxs, out_row)) {
                continue;
            }

            // The probe row counts as matched as soon as the join conjuncts pass,
            // regardless of whether the remaining conjuncts accept the output row.
            _matched_probe = true;

            // left_anti_join: a probe row that has a match is not returned, so the
            // remaining matches of this probe row are skipped.
            if (_join_op == TJoinOp::LEFT_ANTI_JOIN) {
                _hash_tbl_iterator = _hash_tbl->end();
                break;
            }

            if (eval_conjuncts(conjunct_ctxs, num_conjunct_ctxs, out_row)) {
                ++rows_returned;

                // Filled up out batch or hit limit
                if (UNLIKELY(rows_returned == max_added_rows)) {
                    goto end;
                }

                // Advance to next out row
                out_row_mem += out_batch->row_byte_size();
                out_row = reinterpret_cast<TupleRow*>(out_row_mem);
            }

            // Handle left semi-join: one match per probe row is enough.
            if (_match_one_build) {
                _hash_tbl_iterator = _hash_tbl->end();
                break;
            }
        }

        // Handle left outer-join and left anti-join: a probe row without any match
        // is emitted once, with no build row attached.
        if ((!_matched_probe && _match_all_probe) ||
            ((!_matched_probe && _join_op == TJoinOp::LEFT_ANTI_JOIN))) {
            create_output_row(out_row, _current_probe_row, nullptr);
            // Mark the probe row as handled so that it is not emitted again.
            _matched_probe = true;

            if (ExecNode::eval_conjuncts(conjunct_ctxs, num_conjunct_ctxs, out_row)) {
                ++rows_returned;

                if (UNLIKELY(rows_returned == max_added_rows)) {
                    goto end;
                }

                // Advance to next out row
                out_row_mem += out_batch->row_byte_size();
                out_row = reinterpret_cast<TupleRow*>(out_row_mem);
            }
        }

        if (!_hash_tbl_iterator.has_next()) {
            // Advance to the next probe row
            if (UNLIKELY(_probe_batch_pos == probe_rows)) {
                goto end;
            }

            _current_probe_row = probe_batch->get_row(_probe_batch_pos++);
            _hash_tbl_iterator = _hash_tbl->find(_current_probe_row);
            _matched_probe = false;
        }
    }

end:
    // When the output filled up inside the match loop, the semi-join reset of the
    // iterator above was skipped. Do it here so that the pending matches of an
    // already matched probe row are not visited once probing continues.
    if (_match_one_build && _matched_probe) {
        _hash_tbl_iterator = _hash_tbl->end();
    }

    out_batch->commit_rows(rows_returned);
    return rows_returned;
}
// Inserts every row of 'build_batch' into the hash table.
// When the build table has too many duplicated rows, hash collisions become very
// serious. In that case the duplicated values do not need to be stored, so a
// unique hash table is built instead (selected by _build_unique).
void HashJoinNode::process_build_batch(RowBatch* build_batch) {
    if (_build_unique) {
        // De-duplicating path: rows go in through insert_unique().
        for (int row_pos = 0; row_pos < build_batch->num_rows(); ++row_pos) {
            TupleRow* build_row = build_batch->get_row(row_pos);
            _hash_tbl->insert_unique(build_row);
        }
        return;
    }

    // Regular path: every build row is inserted.
    for (int row_pos = 0; row_pos < build_batch->num_rows(); ++row_pos) {
        TupleRow* build_row = build_batch->get_row(row_pos);
        _hash_tbl->insert(build_row);
    }
}
} // namespace doris