baidu palo

This commit is contained in:
cyongli
2017-08-11 17:51:21 +08:00
commit e2311f656e
1988 changed files with 586941 additions and 0 deletions

View File

@ -0,0 +1,231 @@
// Modifications copyright (C) 2017, Baidu.com, Inc.
// Copyright 2017 The Apache Software Foundation
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "codegen/subexpr_elimination.h"
#include <fstream>
#include <iostream>
#include <sstream>
#include <boost/thread/mutex.hpp>
#include <llvm/Analysis/Dominators.h>
#include <llvm/Analysis/Passes.h>
#include <llvm/Analysis/InstructionSimplify.h>
#include <llvm/Support/DynamicLibrary.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/InstIterator.h>
#include <llvm/Support/NoFolder.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Support/system_error.h>
#include "llvm/Transforms/IPO.h"
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Utils/SSAUpdater.h>
#include "common/logging.h"
#include "codegen/subexpr_elimination.h"
#include "palo_ir/palo_ir_names.h"
#include "util/cpu_info.h"
#include "util/path_builder.h"
using llvm::CallInst;
using llvm::BitCastInst;
using llvm::Instruction;
using llvm::LoadInst;
using llvm::StoreInst;
using llvm::Function;
using llvm::Value;
using llvm::DominatorTree;
namespace palo {
// Builds the pass around 'codegen'; the pointer is kept in _codegen and used by run().
SubExprElimination::SubExprElimination(LlvmCodeGen* codegen)
        : _codegen(codegen) {}
// Before running the standard llvm optimization passes, first remove redundant calls
// to slotref expression. SlotRefs are more heavyweight due to the null handling that
// is required and after they are inlined, llvm is unable to eliminate the redundant
// inlined code blocks.
// For example:
// select colA + colA would generate an inner loop with 2 calls to the colA slot ref,
// rather than doing subexpression elimination. To handle this, we will:
// 1. inline all call sites in the original function except calls to SlotRefs
// 2. for all call sites to SlotRefs except the first to that SlotRef, replace the
// results from the secondary calls with the result from the first and remove
// the call instruction.
// 3. Inline calls to the SlotRefs (there should only be one for each slot ref).
//
// In the above example, the input function would look something like:
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
// bool lhs_is_null, rhs_is_null;
// int lhs_value = SlotRef(row, &lhs_is_null);
// if (lhs_is_null) { *is_null = true; return 0; }
// int rhs_value = SlotRef(row, &rhs_is_null);
// if (rhs_is_null) { *is_null = true; return 0; }
// *is_null = false; return lhs_value + rhs_value;
// }
// During step 2, we'd substitute the second call to SlotRef with the results from
// the first call.
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
// bool lhs_is_null, rhs_is_null;
// int lhs_value = SlotRef(row, &lhs_is_null);
// if (lhs_is_null) { *is_null = true; return 0; }
// int rhs_value = lhs_value;
// rhs_is_null = lhs_is_null;
// if (rhs_is_null) { *is_null = true; return 0; }
// *is_null = false; return lhs_value + rhs_value;
// }
// And then rely on llvm to finish removing the redundant code, resulting in:
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
// bool lhs_is_null, rhs_is_null;
// int lhs_value = SlotRef(row, &lhs_is_null);
// if (lhs_is_null) { *is_null = true; return 0; }
// *is_null = false; return lhs_value + lhs_value;
// }
// Details on how to do this:
// http://llvm.org/docs/ProgrammersManual.html#replacing-an-instruction-with-another-value
// Step 2 requires more manipulation to ensure the resulting IR is still valid IR.
// The call to the expr returns two things, both of which need to be replaced.
// The value of the function as the return argument and whether or not the result was
// null as a function output argument.
// 1. The return value is trivial since with SSA, it is easy to identify all uses of
// the value. We simply replace the subsequent call instructions with the value.
// 2. For the is_null result ptr, we replace the call to the expr with a store
// instruction of the cached value.
// i.e:
// val1 = Call(is_null_ptr);
// is_null1 = *is_null_ptr
// ...
// val2 = Call(is_null_ptr);
// is_null2 = *is_null_ptr
// Becomes:
// val1 = Call(is_null_ptr);
// is_null1 = *is_null_ptr
// ...
// val2 = val1;
// *is_null_ptr = is_null1;
// is_null2 = *is_null_ptr
// We do this because the is_null ptr is not SSA form, making manipulating it
// complex. The above approach exactly preserves the Call function, including
// all writes to ptrs. We then rely on the llvm load/store removal pass which
// will remove the redundant loads (which is tricky since you have to track
// other instructions that wrote to the ptr, etc).
// When doing the eliminations, we need to consider the call graph to make sure
// the instruction we are replacing with dominates the instruction we are replacing;
// that is, we need to guarantee the instruction we are replacing with always executes
// before the replacee instruction in all code paths.
// TODO: remove all this with expr refactoring. Everything will be SSA form then.
// Remembers the first evaluation of a registered expr function for a given
// (function, row) pair so that later identical calls can reuse it.
// Both members are left uninitialized on default construction; run() assigns them
// before the entry is stored in its cache.
struct CachedExprResult {
// The first call instruction seen for the pair. Its value replaces all uses of
// subsequent calls.
CallInst* result;
// The instruction that reads the is_null output right after the first call (run()
// stores the LoadInst following the call here). Subsequent calls are replaced with a
// store of this value.
Instruction* is_null_value;
};
// Removes redundant calls to registered expr functions from 'fn'.
//
// The work is done in three steps (see the long comment above for the rationale):
//   1. Call inline_call_sites(fn, true) repeatedly until it reports that nothing was
//      inlined.
//   2. Walk every instruction of 'fn'. For each call to a function found in
//      _codegen->_registered_exprs that is immediately followed by a load of its
//      is_null output argument, either record it as the first evaluation of its
//      (function, row) pair, or, if the recorded call dominates it, replace it with
//      the recorded value plus a store of the recorded is_null value.
//   3. Call inline_call_sites(fn, false) once.
//
// Calls that do not match the expected IR shape are left untouched: debug builds still
// trip the DCHECKs, release builds skip the call instead of rewriting it blindly.
//
// Always returns true.
bool SubExprElimination::run(Function* fn) {
    // Step 1:
    // This assumes that all redundant exprs have been registered.
    while (_codegen->inline_call_sites(fn, true) > 0) {
        // Keep going until a round inlines nothing.
    }

    // Mapping of (expr eval function, its 'row' arg) to cached result. We want to remove
    // redundant calls to the same function with the same argument.
    typedef std::pair<Function*, Value*> CallDesc;
    typedef std::map<CallDesc, CachedExprResult> ResultCache;
    ResultCache cached_slot_ref_results;

    // Step 2:
    // The dominator tree is computed once up front; the rewrites below only add and
    // remove instructions inside existing basic blocks.
    DominatorTree dom_tree;
    dom_tree.runOnFunction(*fn);

    const llvm::inst_iterator fn_end = llvm::inst_end(fn);
    llvm::inst_iterator instr_iter = llvm::inst_begin(fn);

    // Loop over every instruction in the function. The iterator is always advanced
    // before the current instruction is inspected so that erasing the current call
    // never touches the position we resume from.
    while (instr_iter != fn_end) {
        Instruction* instr = &*instr_iter;
        ++instr_iter;

        // Look for call instructions.
        CallInst* call_instr = llvm::dyn_cast<CallInst>(instr);
        if (!call_instr) {
            continue;
        }

        // Only calls to registered expr functions are candidates.
        Function* called_fn = call_instr->getCalledFunction();
        if (_codegen->_registered_exprs.find(called_fn) ==
                _codegen->_registered_exprs.end()) {
            continue;
        }

        // Found a registered expr function. We generate the IR in a very specific way
        // when calling the expr. The call instruction is always followed by loading the
        // resulting is_null result. We need to update both.
        // TODO: we need to update this to do more analysis since we are relying on a very
        // specific code structure to do this.
        // Arguments are (row, scratch_buffer, is_null);
        DCHECK_EQ(call_instr->getNumArgOperands(), 3);
        if (call_instr->getNumArgOperands() != 3) {
            continue;
        }

        // Get at the underlying row arg. We need to differentiate between
        // call Fn(row1) and call Fn(row2). (identical fns but different input).
        Value* row_arg = call_instr->getArgOperand(0);
        DCHECK(BitCastInst::classof(row_arg));
        BitCastInst* row_cast = llvm::dyn_cast<BitCastInst>(row_arg);
        if (!row_cast) {
            continue;
        }
        row_arg = row_cast->getOperand(0);

        // Peek at the instruction right after the call. It is only consumed when it is
        // the expected load, so that an unrelated instruction (possibly another
        // registered call) is still examined by the next iteration.
        if (instr_iter == fn_end) {
            break;
        }
        LoadInst* is_null_value = llvm::dyn_cast<LoadInst>(&*instr_iter);
        if (!is_null_value) {
            continue;
        }
        ++instr_iter;

        // Subexpr elimination requires the IR to be a very specific form.
        // call SlotRef(row, NULL, is_null_ptr)
        // load is_null_ptr
        // Since we generate this IR currently, we can enforce this logic in our exprs
        // TODO: this should be removed/generalized with expr refactoring
        Value* loaded_ptr = is_null_value->getPointerOperand();
        DCHECK_EQ(loaded_ptr, call_instr->getArgOperand(2));
        if (loaded_ptr != call_instr->getArgOperand(2)) {
            continue;
        }

        const CallDesc call_desc(called_fn, row_arg);
        ResultCache::iterator cached = cached_slot_ref_results.find(call_desc);
        if (cached == cached_slot_ref_results.end()) {
            // First evaluation of this (function, row) pair: remember it.
            CachedExprResult cache_entry;
            cache_entry.result = call_instr;
            cache_entry.is_null_value = is_null_value;
            cached_slot_ref_results.insert(ResultCache::value_type(call_desc, cache_entry));
            continue;
        }

        // Reuse the result, but only if the cached call is guaranteed to have executed
        // before this one on every path.
        const CachedExprResult& cache_entry = cached->second;
        if (dom_tree.dominates(cache_entry.result, call_instr)) {
            // Reproduce the call's write to the is_null pointer; the store is inserted
            // immediately before the call that is about to be removed.
            new StoreInst(cache_entry.is_null_value, loaded_ptr, call_instr);
            call_instr->replaceAllUsesWith(cache_entry.result);
            call_instr->eraseFromParent();
        }
    }

    // Step 3:
    _codegen->inline_call_sites(fn, false);
    return true;
}
}