// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "codegen/subexpr_elimination.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "llvm/Transforms/IPO.h" #include #include #include "common/logging.h" #include "codegen/subexpr_elimination.h" #include "doris_ir/doris_ir_names.h" #include "util/cpu_info.h" #include "util/path_builder.h" using llvm::CallInst; using llvm::BitCastInst; using llvm::Instruction; using llvm::LoadInst; using llvm::StoreInst; using llvm::Function; using llvm::Value; using llvm::DominatorTree; namespace doris { SubExprElimination::SubExprElimination(LlvmCodeGen* codegen) : _codegen(codegen) { } // Before running the standard llvm optimization passes, first remove redundant calls // to slotref expression. SlotRefs are more heavyweight due to the null handling that // is required and after they are inlined, llvm is unable to eliminate the redundant // inlined code blocks. // For example: // select colA + colA would generate an inner loop with 2 calls to the colA slot ref, // rather than doing subexpression elimination. To handle this, we will: // 1. 
//    inline all call sites in the original function except calls to SlotRefs
// 2. for all call sites to SlotRefs except the first to that SlotRef, replace the
//    results from the secondary calls with the result from the first and remove
//    the call instruction.
// 3. Inline calls to the SlotRefs (there should only be one for each slot ref).
//
// In the above example, the input function would look something like:
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
//   bool lhs_is_null, rhs_is_null;
//   int lhs_value = SlotRef(row, &lhs_is_null);
//   if (lhs_is_null) { *is_null = true; return 0; }
//   int rhs_value = SlotRef(row, &rhs_is_null);
//   if (rhs_is_null) { *is_null = true; return 0; }
//   *is_null = false; return lhs_value + rhs_value;
// }
// During step 2, we'd substitute the second call to SlotRef with the results from
// the first call.
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
//   bool lhs_is_null, rhs_is_null;
//   int lhs_value = SlotRef(row, &lhs_is_null);
//   if (lhs_is_null) { *is_null = true; return 0; }
//   int rhs_value = lhs_value;
//   rhs_is_null = lhs_is_null;
//   if (rhs_is_null) { *is_null = true; return 0; }
//   *is_null = false; return lhs_value + rhs_value;
// }
// And then rely on llvm to finish removing the redundant code, resulting in:
// int ArithmeticAdd(TupleRow* row, bool* is_null) {
//   bool lhs_is_null, rhs_is_null;
//   int lhs_value = SlotRef(row, &lhs_is_null);
//   if (lhs_is_null) { *is_null = true; return 0; }
//   *is_null = false; return lhs_value + lhs_value;
// }
// Details on how to do this:
// http://llvm.org/docs/ProgrammersManual.html#replacing-an-instruction-with-another-value
// Step 2 requires more manipulation to ensure the resulting IR is still valid IR.
// The call to the expr returns two things, both of which need to be replaced.
// The value of the function as the return argument and whether or not the result was
// null as a function output argument.
// 1.
The return value is trivial since with SSA, it is easy to identity all uses of // We simply replace the subsequent call instructions with the value. // 2. For the is_null result ptr, we replace the call to the expr with a store // instruction of the cached value. // i.e: // val1 = Call(is_null_ptr); // is_null1 = *is_null_ptr // ... // val2 = Call(is_null_ptr); // is_null2 = *is_null_ptr // Becomes: // val1 = Call(is_null_ptr); // is_null1 = *is_null_ptr // ... // val2 = val1; // *is_null_ptr = is_null1; // is_null2 = *is_null_ptr // We do this because the is_null ptr is not SSA form, making manipulating it // complex. The above approach exactly preserves the Call function, including // all writes to ptrs. We then rely on the llvm load/store removal pass which // will remove the redundant loads (which is tricky since you have to track // other instructions that wrote to the ptr, etc). // When doing the eliminations, we need to consider the call graph to make sure // the instruction we are replacing with dominates the instruction we are replacing; // that is, we need to guarantee the instruction we are replacing with always executes // before the replacee instruction in all code paths. // TODO: remove all this with expr refactoring. Everything will be SSA form then. struct CachedExprResult { // First function call result. Subsequent calls will be replaced with this value CallInst* result; // First is null result. Subsequent calls will be replaced with this value. Instruction* is_null_value; }; bool SubExprElimination::run(Function* fn) { // Step 1: int num_inlined = 0; do { // This assumes that all redundant exprs have been registered. num_inlined = _codegen->inline_call_sites(fn, true); } while (num_inlined > 0); // Mapping of (expr eval function, its 'row' arg) to cached result. We want to remove // redundant calls to the same function with the same argument. 
std::map, CachedExprResult> cached_slot_ref_results; // Step 2: DominatorTree dom_tree; dom_tree.runOnFunction(*fn); llvm::inst_iterator fn_end = llvm::inst_end(fn); llvm::inst_iterator instr_iter = llvm::inst_begin(fn); // Loop over every instruction in the function. while (instr_iter != fn_end) { Instruction* instr = &*instr_iter; ++instr_iter; // Look for call instructions if (!CallInst::classof(instr)) { continue; } CallInst* call_instr = reinterpret_cast(instr); Function* called_fn = call_instr->getCalledFunction(); if (_codegen->_registered_exprs.find(called_fn) == _codegen->_registered_exprs.end()) { continue; } // Found a registered expr function. We generate the IR in a very specific way // when calling the expr. The call instruction is always followed by loading the // resulting is_null result. We need to update both. // TODO: we need to update this to do more analysis since we are relying on a very // specific code structure to do this. // Arguments are (row, scratch_buffer, is_null); DCHECK_EQ(call_instr->getNumArgOperands(), 3); Value* row_arg = call_instr->getArgOperand(0); DCHECK(BitCastInst::classof(row_arg)); BitCastInst* row_cast = reinterpret_cast(row_arg); // Get at the underlying row arg. We need to differentiate between // call Fn(row1) and call Fn(row2). (identical fns but different input). row_arg = row_cast->getOperand(0); instr = &*instr_iter; ++instr_iter; if (!LoadInst::classof(instr)) { continue; } LoadInst* is_null_value = reinterpret_cast(instr); Value* loaded_ptr = is_null_value->getPointerOperand(); // Subexpr elimination requires the IR to be a very specific form. 
// call SlotRef(row, NULL, is_null_ptr) // load is_null_ptr // Since we generate this IR currently, we can enforce this logic in our exprs // TODO: this should be removed/generalized with expr refactoring DCHECK_EQ(loaded_ptr, call_instr->getArgOperand(2)); std::pair call_desc = std::make_pair(called_fn, row_arg); if (cached_slot_ref_results.find(call_desc) == cached_slot_ref_results.end()) { CachedExprResult cache_entry; cache_entry.result = call_instr; cache_entry.is_null_value = is_null_value; cached_slot_ref_results[call_desc] = cache_entry; } else { // Reuse the result. CachedExprResult& cache_entry = cached_slot_ref_results[call_desc]; if (dom_tree.dominates(cache_entry.result, call_instr)) { new StoreInst(cache_entry.is_null_value, loaded_ptr, call_instr); call_instr->replaceAllUsesWith(cache_entry.result); call_instr->eraseFromParent(); } } } // Step 3: _codegen->inline_call_sites(fn, false); return true; } }