baidu palo
This commit is contained in:
231
be/src/codegen/subexpr_elimination.cpp
Normal file
231
be/src/codegen/subexpr_elimination.cpp
Normal file
@ -0,0 +1,231 @@
|
||||
// Modifications copyright (C) 2017, Baidu.com, Inc.
|
||||
// Copyright 2017 The Apache Software Foundation
|
||||
|
||||
// Licensed to the Apache Software Foundation (ASF) under one
|
||||
// or more contributor license agreements. See the NOTICE file
|
||||
// distributed with this work for additional information
|
||||
// regarding copyright ownership. The ASF licenses this file
|
||||
// to you under the Apache License, Version 2.0 (the
|
||||
// "License"); you may not use this file except in compliance
|
||||
// with the License. You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing,
|
||||
// software distributed under the License is distributed on an
|
||||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
// KIND, either express or implied. See the License for the
|
||||
// specific language governing permissions and limitations
|
||||
// under the License.
|
||||
|
||||
#include "codegen/subexpr_elimination.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
|
||||
#include <boost/thread/mutex.hpp>
|
||||
#include <llvm/Analysis/Dominators.h>
|
||||
#include <llvm/Analysis/Passes.h>
|
||||
#include <llvm/Analysis/InstructionSimplify.h>
|
||||
#include <llvm/Support/DynamicLibrary.h>
|
||||
#include <llvm/IRReader/IRReader.h>
|
||||
#include <llvm/Support/MemoryBuffer.h>
|
||||
#include <llvm/Support/InstIterator.h>
|
||||
#include <llvm/Support/NoFolder.h>
|
||||
#include <llvm/Support/TargetSelect.h>
|
||||
#include <llvm/Support/raw_ostream.h>
|
||||
#include <llvm/Support/system_error.h>
|
||||
#include "llvm/Transforms/IPO.h"
|
||||
#include <llvm/Transforms/Scalar.h>
|
||||
#include <llvm/Transforms/Utils/SSAUpdater.h>
|
||||
|
||||
#include "common/logging.h"
|
||||
#include "codegen/subexpr_elimination.h"
|
||||
#include "palo_ir/palo_ir_names.h"
|
||||
#include "util/cpu_info.h"
|
||||
#include "util/path_builder.h"
|
||||
|
||||
using llvm::CallInst;
|
||||
using llvm::BitCastInst;
|
||||
using llvm::Instruction;
|
||||
using llvm::LoadInst;
|
||||
using llvm::StoreInst;
|
||||
using llvm::Function;
|
||||
using llvm::Value;
|
||||
using llvm::DominatorTree;
|
||||
namespace palo {
|
||||
|
||||
// Creates the pass and remembers 'codegen' in _codegen; run() later uses it to inline
// call sites and to look up the registered expr functions.
SubExprElimination::SubExprElimination(LlvmCodeGen* codegen)
        : _codegen(codegen) {}
|
||||
|
||||
// Before running the standard llvm optimization passes, first remove redundant calls
|
||||
// to slotref expression. SlotRefs are more heavyweight due to the null handling that
|
||||
// is required and after they are inlined, llvm is unable to eliminate the redundant
|
||||
// inlined code blocks.
|
||||
// For example:
|
||||
// select colA + colA would generate an inner loop with 2 calls to the colA slot ref,
|
||||
// rather than doing subexpression elimination. To handle this, we will:
|
||||
// 1. inline all call sites in the original function except calls to SlotRefs
|
||||
// 2. for all call sites to SlotRefs except the first to that SlotRef, replace the
|
||||
// results from the secondary calls with the result from the first and remove
|
||||
// the call instruction.
|
||||
// 3. Inline calls to the SlotRefs (there should only be one for each slot ref).
|
||||
//
|
||||
// In the above example, the input function would look something like:
|
||||
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
|
||||
// bool lhs_is_null, rhs_is_null;
|
||||
// int lhs_value = SlotRef(row, &lhs_is_null);
|
||||
// if (lhs_is_null) { *is_null = true; return 0; }
|
||||
// int rhs_value = SlotRef(row, &rhs_is_null);
|
||||
// if (rhs_is_null) { *is_null = true; return 0; }
|
||||
// *is_null = false; return lhs_value + rhs_value;
|
||||
// }
|
||||
// During step 2, we'd substitute the second call to SlotRef with the results from
|
||||
// the first call.
|
||||
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
|
||||
// bool lhs_is_null, rhs_is_null;
|
||||
// int lhs_value = SlotRef(row, &lhs_is_null);
|
||||
// if (lhs_is_null) { *is_null = true; return 0; }
|
||||
// int rhs_value = lhs_value;
|
||||
// rhs_is_null = lhs_is_null;
|
||||
// if (rhs_is_null) { *is_null = true; return 0; }
|
||||
// *is_null = false; return lhs_value + rhs_value;
|
||||
// }
|
||||
// And then rely on llvm to finish removing the redundant code, resulting in:
|
||||
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
|
||||
// bool lhs_is_null, rhs_is_null;
|
||||
// int lhs_value = SlotRef(row, &lhs_is_null);
|
||||
// if (lhs_is_null) { *is_null = true; return 0; }
|
||||
// *is_null = false; return lhs_value + lhs_value;
|
||||
// }
|
||||
// Details on how to do this:
|
||||
// http://llvm.org/docs/ProgrammersManual.html#replacing-an-instruction-with-another-value
|
||||
|
||||
// Step 2 requires more manipulation to ensure the resulting IR is still valid IR.
|
||||
// The call to the expr returns two things, both of which need to be replaced.
|
||||
// The value of the function as the return argument and whether or not the result was
|
||||
// null as a function output argument.
|
||||
// 1. The return value is trivial since with SSA, it is easy to identify all uses of it.
|
||||
// We simply replace the subsequent call instructions with the value.
|
||||
// 2. For the is_null result ptr, we replace the call to the expr with a store
|
||||
// instruction of the cached value.
|
||||
// i.e:
|
||||
// val1 = Call(is_null_ptr);
|
||||
// is_null1 = *is_null_ptr
|
||||
// ...
|
||||
// val2 = Call(is_null_ptr);
|
||||
// is_null2 = *is_null_ptr
|
||||
// Becomes:
|
||||
// val1 = Call(is_null_ptr);
|
||||
// is_null1 = *is_null_ptr
|
||||
// ...
|
||||
// val2 = val1;
|
||||
// *is_null_ptr = is_null1;
|
||||
// is_null2 = *is_null_ptr
|
||||
// We do this because the is_null ptr is not SSA form, making manipulating it
|
||||
// complex. The above approach exactly preserves the Call function, including
|
||||
// all writes to ptrs. We then rely on the llvm load/store removal pass which
|
||||
// will remove the redundant loads (which is tricky since you have to track
|
||||
// other instructions that wrote to the ptr, etc).
|
||||
// When doing the eliminations, we need to consider the control flow graph to make sure
|
||||
// the instruction we are replacing with dominates the instruction we are replacing;
|
||||
// that is, we need to guarantee the instruction we are replacing with always executes
|
||||
// before the replacee instruction in all code paths.
|
||||
// TODO: remove all this with expr refactoring. Everything will be SSA form then.
|
||||
struct CachedExprResult {
|
||||
// First function call result. Subsequent calls will be replaced with this value
|
||||
CallInst* result;
|
||||
// First is null result. Subsequent calls will be replaced with this value.
|
||||
Instruction* is_null_value;
|
||||
};
|
||||
|
||||
// Eliminates redundant calls to registered expr functions inside 'fn'.
//
// The work happens in three steps:
//   1. Repeatedly inline call sites in 'fn' until a pass inlines nothing more.
//   2. Walk every instruction. For each call to a registered expr function that is
//      immediately followed by a load of its is_null output, remember the first
//      (function, row) occurrence. A later occurrence that is dominated by the first is
//      replaced: the cached is_null value is stored to the is_null pointer, all uses of
//      the call are redirected to the first call, and the call is erased.
//   3. Run one final inlining pass over 'fn'.
//
// fn: the function to rewrite in place.
// Returns: always true.
bool SubExprElimination::run(Function* fn) {
    // Step 1:
    // NOTE(review): the second argument presumably tells the inliner to leave registered
    // exprs alone in this step -- confirm against LlvmCodeGen::inline_call_sites.
    int num_inlined = 0;
    do {
        // This assumes that all redundant exprs have been registered.
        num_inlined = _codegen->inline_call_sites(fn, true);
    } while (num_inlined > 0);

    // Mapping of (expr eval function, its 'row' arg) to cached result. We want to remove
    // redundant calls to the same function with the same argument.
    typedef std::pair<Function*, Value*> CallDesc;
    typedef std::map<CallDesc, CachedExprResult> ResultCache;
    ResultCache cached_slot_ref_results;

    // Step 2:
    // Only instructions are inserted/erased below; no blocks or branches change, so the
    // dominator tree computed here is used for the whole walk.
    DominatorTree dom_tree;
    dom_tree.runOnFunction(*fn);

    llvm::inst_iterator fn_end = llvm::inst_end(fn);
    llvm::inst_iterator instr_iter = llvm::inst_begin(fn);
    // Loop over every instruction in the function. The iterator is advanced before the
    // current instruction is processed so that erasing that instruction is safe.
    while (instr_iter != fn_end) {
        Instruction* instr = &*instr_iter;
        ++instr_iter;

        // Look for call instructions
        CallInst* call_instr = llvm::dyn_cast<CallInst>(instr);
        if (call_instr == NULL) {
            continue;
        }

        Function* called_fn = call_instr->getCalledFunction();
        if (_codegen->_registered_exprs.find(called_fn) ==
                _codegen->_registered_exprs.end()) {
            continue;
        }

        // Found a registered expr function. We generate the IR in a very specific way
        // when calling the expr. The call instruction is always followed by loading the
        // resulting is_null result. We need to update both.
        // TODO: we need to update this to do more analysis since we are relying on a very
        // specific code structure to do this.

        // Arguments are (row, scratch_buffer, is_null);
        DCHECK_EQ(call_instr->getNumArgOperands(), 3);
        Value* row_arg = call_instr->getArgOperand(0);

        // Get at the underlying row arg. We need to differentiate between
        // call Fn(row1) and call Fn(row2). (identical fns but different input).
        DCHECK(BitCastInst::classof(row_arg));
        row_arg = llvm::cast<BitCastInst>(row_arg)->getOperand(0);

        // Peek at the instruction after the call without consuming it yet: there may be
        // none, and if it is not the expected load it must still be visited by the main
        // loop (it could itself be a call to a registered expr).
        if (instr_iter == fn_end) {
            break;
        }
        Instruction* next_instr = &*instr_iter;
        LoadInst* is_null_value = llvm::dyn_cast<LoadInst>(next_instr);
        if (is_null_value == NULL) {
            continue;
        }
        ++instr_iter;

        Value* loaded_ptr = is_null_value->getPointerOperand();

        // Subexpr elimination requires the IR to be a very specific form.
        //   call SlotRef(row, NULL, is_null_ptr)
        //   load is_null_ptr
        // Since we generate this IR currently, we can enforce this logic in our exprs
        // TODO: this should be removed/generalized with expr refactoring
        DCHECK_EQ(loaded_ptr, call_instr->getArgOperand(2));

        const CallDesc call_desc(called_fn, row_arg);
        ResultCache::iterator cached = cached_slot_ref_results.find(call_desc);
        if (cached == cached_slot_ref_results.end()) {
            // First time this (function, row) pair is seen: remember its results.
            CachedExprResult cache_entry;
            cache_entry.result = call_instr;
            cache_entry.is_null_value = is_null_value;
            cached_slot_ref_results.insert(std::make_pair(call_desc, cache_entry));
            continue;
        }

        // Reuse the result, but only if the first call is guaranteed to have executed
        // before this one on every path.
        const CachedExprResult& cache_entry = cached->second;
        if (dom_tree.dominates(cache_entry.result, call_instr)) {
            // The store is inserted right before the call it replaces, so the load that
            // follows still reads the is_null value it expects.
            new StoreInst(cache_entry.is_null_value, loaded_ptr, call_instr);
            call_instr->replaceAllUsesWith(cache_entry.result);
            call_instr->eraseFromParent();
        }
    }

    // Step 3:
    _codegen->inline_call_sites(fn, false);
    return true;
}
|
||||
|
||||
}
|
||||
Reference in New Issue
Block a user