229 lines
9.5 KiB
C++
229 lines
9.5 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
#include "codegen/subexpr_elimination.h"
|
|
|
|
#include <fstream>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
|
|
#include <boost/thread/mutex.hpp>
|
|
#include <llvm/Analysis/Dominators.h>
|
|
#include <llvm/Analysis/Passes.h>
|
|
#include <llvm/Analysis/InstructionSimplify.h>
|
|
#include <llvm/Support/DynamicLibrary.h>
|
|
#include <llvm/IRReader/IRReader.h>
|
|
#include <llvm/Support/MemoryBuffer.h>
|
|
#include <llvm/Support/InstIterator.h>
|
|
#include <llvm/Support/NoFolder.h>
|
|
#include <llvm/Support/TargetSelect.h>
|
|
#include <llvm/Support/raw_ostream.h>
|
|
#include <llvm/Support/system_error.h>
|
|
#include "llvm/Transforms/IPO.h"
|
|
#include <llvm/Transforms/Scalar.h>
|
|
#include <llvm/Transforms/Utils/SSAUpdater.h>
|
|
|
|
#include "common/logging.h"
|
|
#include "codegen/subexpr_elimination.h"
|
|
#include "doris_ir/doris_ir_names.h"
|
|
#include "util/cpu_info.h"
|
|
#include "util/path_builder.h"
|
|
|
|
using llvm::CallInst;
|
|
using llvm::BitCastInst;
|
|
using llvm::Instruction;
|
|
using llvm::LoadInst;
|
|
using llvm::StoreInst;
|
|
using llvm::Function;
|
|
using llvm::Value;
|
|
using llvm::DominatorTree;
|
|
namespace doris {
|
|
|
|
// Remembers the code generator; run() goes through it to inline call sites and
// to look up which functions are registered exprs.
SubExprElimination::SubExprElimination(LlvmCodeGen* codegen)
        : _codegen(codegen) {}
|
|
|
|
// Before running the standard llvm optimization passes, first remove redundant calls
|
|
// to slotref expression. SlotRefs are more heavyweight due to the null handling that
|
|
// is required and after they are inlined, llvm is unable to eliminate the redundant
|
|
// inlined code blocks.
|
|
// For example:
|
|
// select colA + colA would generate an inner loop with 2 calls to the colA slot ref,
|
|
// rather than doing subexpression elimination. To handle this, we will:
|
|
// 1. inline all call sites in the original function except calls to SlotRefs
|
|
// 2. for all call sites to SlotRefs except the first to that SlotRef, replace the
|
|
// results from the secondary calls with the result from the first and remove
|
|
// the call instruction.
|
|
// 3. Inline calls to the SlotRefs (there should only be one for each slot ref).
|
|
//
|
|
// In the above example, the input function would look something like:
|
|
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
|
|
// bool lhs_is_null, rhs_is_null;
|
|
// int lhs_value = SlotRef(row, &lhs_is_null);
|
|
// if (lhs_is_null) { *is_null = true; return 0; }
|
|
// int rhs_value = SlotRef(row, &rhs_is_null);
|
|
// if (rhs_is_null) { *is_null = true; return 0; }
|
|
// *is_null = false; return lhs_value + rhs_value;
|
|
// }
|
|
// During step 2, we'd substitute the second call to SlotRef with the results from
|
|
// the first call.
|
|
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
|
|
// bool lhs_is_null, rhs_is_null;
|
|
// int lhs_value = SlotRef(row, &lhs_is_null);
|
|
// if (lhs_is_null) { *is_null = true; return 0; }
|
|
// int rhs_value = lhs_value;
|
|
// rhs_is_null = lhs_is_null;
|
|
// if (rhs_is_null) { *is_null = true; return 0; }
|
|
// *is_null = false; return lhs_value + rhs_value;
|
|
// }
|
|
// And then rely on llvm to finish removing the redundant code, resulting in:
|
|
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
|
|
// bool lhs_is_null, rhs_is_null;
|
|
// int lhs_value = SlotRef(row, &lhs_is_null);
|
|
// if (lhs_is_null) { *is_null = true; return 0; }
|
|
// *is_null = false; return lhs_value + lhs_value;
|
|
// }
|
|
// Details on how to do this:
|
|
// http://llvm.org/docs/ProgrammersManual.html#replacing-an-instruction-with-another-value
|
|
|
|
// Step 2 requires more manipulation to ensure the resulting IR is still valid IR.
|
|
// The call to the expr returns two things, both of which need to be replaced.
|
|
// The value of the function as the return argument and whether or not the result was
|
|
// null as a function output argument.
|
|
// 1. The return value is trivial since with SSA, it is easy to identify all uses of it.
|
|
// We simply replace the subsequent call instructions with the value.
|
|
// 2. For the is_null result ptr, we replace the call to the expr with a store
|
|
// instruction of the cached value.
|
|
// i.e:
|
|
// val1 = Call(is_null_ptr);
|
|
// is_null1 = *is_null_ptr
|
|
// ...
|
|
// val2 = Call(is_null_ptr);
|
|
// is_null2 = *is_null_ptr
|
|
// Becomes:
|
|
// val1 = Call(is_null_ptr);
|
|
// is_null1 = *is_null_ptr
|
|
// ...
|
|
// val2 = val1;
|
|
// *is_null_ptr = is_null1;
|
|
// is_null2 = *is_null_ptr
|
|
// We do this because the is_null ptr is not SSA form, making manipulating it
|
|
// complex. The above approach exactly preserves the Call function, including
|
|
// all writes to ptrs. We then rely on the llvm load/store removal pass which
|
|
// will remove the redundant loads (which is tricky since you have to track
|
|
// other instructions that wrote to the ptr, etc).
|
|
// When doing the eliminations, we need to consider the control flow graph to make sure
|
|
// the instruction we are replacing with dominates the instruction we are replacing;
|
|
// that is, we need to guarantee the instruction we are replacing with always executes
|
|
// before the replacee instruction in all code paths.
|
|
// TODO: remove all this with expr refactoring. Everything will be SSA form then.
|
|
struct CachedExprResult {
|
|
// First function call result. Subsequent calls will be replaced with this value
|
|
CallInst* result;
|
|
// First is null result. Subsequent calls will be replaced with this value.
|
|
Instruction* is_null_value;
|
|
};
|
|
|
|
// Removes redundant calls to registered expr functions inside 'fn'.
//
// The pass has three steps (see the overview comment earlier in this file):
//  1. Call _codegen->inline_call_sites(fn, true) repeatedly until it reports that
//     nothing more was inlined.
//  2. Walk every instruction of 'fn' looking for the pattern
//         val     = call RegisteredExpr(bitcast(row), scratch_buffer, is_null_ptr)
//         is_null = load is_null_ptr
//     The first occurrence for each (callee, underlying row value) pair is cached.
//     A later occurrence whose call is dominated by the cached call is rewritten:
//     a store of the cached is_null value into is_null_ptr is inserted in front of
//     the call, all uses of the call are redirected to the cached call, and the call
//     is erased. The trailing load is left in place.
//  3. Call _codegen->inline_call_sites(fn, false) once.
//
// Call sites that do not have the expected shape still trip the DCHECKs in debug
// builds; in release builds they are skipped instead of being blindly cast.
//
// Always returns true.
bool SubExprElimination::run(Function* fn) {
    // Step 1: keep inlining until a pass over 'fn' inlines nothing more.
    // This assumes that all redundant exprs have been registered.
    int num_inlined = 0;
    do {
        num_inlined = _codegen->inline_call_sites(fn, true);
    } while (num_inlined > 0);

    // Mapping of (expr eval function, its 'row' arg) to cached result. We want to remove
    // redundant calls to the same function with the same argument.
    typedef std::pair<Function*, Value*> CallDesc;
    typedef std::map<CallDesc, CachedExprResult> CachedResults;
    CachedResults cached_slot_ref_results;

    // Step 2: the dominator tree tells us whether a cached call is guaranteed to have
    // executed before a later candidate call.
    DominatorTree dom_tree;
    dom_tree.runOnFunction(*fn);

    llvm::inst_iterator fn_end = llvm::inst_end(fn);
    llvm::inst_iterator instr_iter = llvm::inst_begin(fn);

    // Loop over every instruction in the function. The iterator is always advanced
    // before the current instruction is touched, so erasing the current call below
    // does not invalidate it.
    while (instr_iter != fn_end) {
        Instruction* instr = &*instr_iter;
        ++instr_iter;

        // Look for call instructions.
        if (!CallInst::classof(instr)) {
            continue;
        }
        CallInst* call_instr = static_cast<CallInst*>(instr);

        // Only calls to registered expr functions are candidates.
        Function* called_fn = call_instr->getCalledFunction();
        if (_codegen->_registered_exprs.find(called_fn) ==
                _codegen->_registered_exprs.end()) {
            continue;
        }

        // Found a registered expr function. We generate the IR in a very specific way
        // when calling the expr. The call instruction is always followed by loading the
        // resulting is_null result. We need to update both.
        // TODO: we need to update this to do more analysis since we are relying on a very
        // specific code structure to do this.

        // Arguments are (row, scratch_buffer, is_null);
        DCHECK_EQ(call_instr->getNumArgOperands(), 3);
        if (call_instr->getNumArgOperands() != 3) {
            // Release-build guard: reading operand 2 below would be out of range.
            continue;
        }

        Value* row_arg = call_instr->getArgOperand(0);
        DCHECK(BitCastInst::classof(row_arg));
        if (!BitCastInst::classof(row_arg)) {
            // Release-build guard: never downcast a value that is not a bitcast.
            continue;
        }
        // Get at the underlying row arg. We need to differentiate between
        // call Fn(row1) and call Fn(row2). (identical fns but different input).
        row_arg = static_cast<BitCastInst*>(row_arg)->getOperand(0);

        // The paired load must be the very next instruction. Only consume it once we
        // know it is a load, so that an unrelated instruction is still visited by the
        // outer loop.
        if (instr_iter == fn_end) {
            break;
        }
        Instruction* next_instr = &*instr_iter;
        if (!LoadInst::classof(next_instr)) {
            continue;
        }
        ++instr_iter;
        LoadInst* is_null_value = static_cast<LoadInst*>(next_instr);
        Value* loaded_ptr = is_null_value->getPointerOperand();

        // Subexpr elimination requires the IR to be a very specific form.
        //   call SlotRef(row, NULL, is_null_ptr)
        //   load is_null_ptr
        // Since we generate this IR currently, we can enforce this logic in our exprs
        // TODO: this should be removed/generalized with expr refactoring
        DCHECK_EQ(loaded_ptr, call_instr->getArgOperand(2));
        if (loaded_ptr != call_instr->getArgOperand(2)) {
            // Release-build guard: the load reads some other pointer, so it is not the
            // is_null result of this call and must not be cached or reused.
            continue;
        }

        CallDesc call_desc = std::make_pair(called_fn, row_arg);
        CachedResults::iterator cached = cached_slot_ref_results.find(call_desc);
        if (cached == cached_slot_ref_results.end()) {
            // First call for this (function, row): remember its results.
            CachedExprResult first_result;
            first_result.result = call_instr;
            first_result.is_null_value = is_null_value;
            cached_slot_ref_results.insert(CachedResults::value_type(call_desc, first_result));
            continue;
        }

        // Reuse the result, but only if the cached call is guaranteed to have run
        // before this one on every path.
        const CachedExprResult& cache_entry = cached->second;
        if (dom_tree.dominates(cache_entry.result, call_instr)) {
            // The new store is inserted before call_instr (third constructor argument),
            // so the load that follows the call reads the cached is_null value.
            new StoreInst(cache_entry.is_null_value, loaded_ptr, call_instr);
            call_instr->replaceAllUsesWith(cache_entry.result);
            call_instr->eraseFromParent();
        }
    }

    // Step 3:
    _codegen->inline_call_sites(fn, false);
    return true;
}
|
|
|
|
}
|