# Modifications copyright (C) 2017, Baidu.com, Inc.
# Copyright 2017 The Apache Software Foundation

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

"""
Generate the C++ implementation of the simple vectorized filter functions.

For every entry in `functions` the script expands the aggregate type names
through `types` and renders one C++ function per concrete signature:
  - comparison filters (Filter_Eq/Ne/Gt/Lt/Ge/Le), rendered from `filter_binary_op`
  - IN-predicate filters (Filter_In), rendered from `filter_in_op`

Running the script writes three files:
  - BE_PATH + 'vector-functions.h'   : declarations of the generated functions
  - BE_PATH + 'vector-functions.cc'  : definitions of the generated functions
  - 'generated_vector_functions.py'  : (current directory) metadata list describing
                                       every generated function
"""

import os
import string
import sys

# C++ body for a binary comparison filter.  Substituted keys: fn_signature,
# native_type1, native_type2, native_op.
filter_binary_op = string.Template("\
bool VectorComputeFunctions::${fn_signature}(\n\
        Expr* expr, VectorizedRowBatch* batch) {\n\
    int n = batch->size();\n\
    if (0 == n) {\n\
        return false;\n\
    }\n\
    int* sel = batch->selected();\n\
    Expr* op1 = expr->children()[0];\n\
    Expr* op2 = expr->children()[1];\n\
    batch->add_column(expr->output_column(), expr->type());\n\
    if (expr->is_constant()) {\n\
        ${native_type1}* val1 = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n\
        ${native_type2}* val2 = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n\
        if (val1 == NULL || val2 == NULL) return false;\n\
        if (!(*val1 ${native_op} *val2)) batch->set_size(0);\n\
    } else if (op1->is_constant()) {\n\
        ${native_type1}* value = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n\
        if (NULL == value || !op2->evaluate(batch)) return false;\n\
        ${native_type1}* vector1\n\
            = reinterpret_cast<${native_type1}*>(batch->column(op2->output_column())->col_data());\n\
\n\
        int new_size = 0;\n\
        if (batch->selected_in_use()) {\n\
            for (int j = 0; j != n; ++j) {\n\
                int i = sel[j];\n\
                if (*value ${native_op} vector1[i]) {\n\
                    sel[new_size++] = i;\n\
                }\n\
            }\n\
            batch->set_size(new_size);\n\
        } else {\n\
            for (int i = 0; i != n; ++i) {\n\
                if (*value ${native_op} vector1[i]) {\n\
                    sel[new_size++] = i;\n\
                }\n\
            }\n\
\n\
            if (new_size < n) {\n\
                batch->set_size(new_size);\n\
                batch->set_selected_in_use(true);\n\
            }\n\
        }\n\
    } else if (op2->is_constant()) {\n\
        ${native_type2}* value = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n\
        if (NULL == value || !op1->evaluate(batch)) return false;\n\
        ${native_type1}* vector1\n\
            = reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
\n\
        int new_size = 0;\n\
        if (batch->selected_in_use()) {\n\
            for (int j = 0; j != n; ++j) {\n\
                int i = sel[j];\n\
                if (vector1[i] ${native_op} *value) {\n\
                    sel[new_size++] = i;\n\
                }\n\
            }\n\
            batch->set_size(new_size);\n\
        } else {\n\
            for (int i = 0; i != n; ++i) {\n\
                if (vector1[i] ${native_op} *value) {\n\
                    sel[new_size++] = i;\n\
                }\n\
            }\n\
\n\
            if (new_size < n) {\n\
                batch->set_size(new_size);\n\
                batch->set_selected_in_use(true);\n\
            }\n\
        }\n\
    } else {\n\
        if (!op1->evaluate(batch) || !op2->evaluate(batch)) return false;\n\
        ${native_type1}* vector1\n\
            = reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
        ${native_type2}* vector2\n\
            = reinterpret_cast<${native_type2}*>(batch->column(op2->output_column())->col_data());\n\
\n\
        int new_size = 0;\n\
        if (batch->selected_in_use()) {\n\
            for (int j = 0; j != n; ++j) {\n\
                int i = sel[j];\n\
                if (vector1[i] ${native_op} vector2[i]) {\n\
                    sel[new_size++] = i;\n\
                }\n\
            }\n\
            batch->set_size(new_size);\n\
        } else {\n\
            for (int i = 0; i != n; ++i) {\n\
                if (vector1[i] ${native_op} vector2[i]) {\n\
                    sel[new_size++] = i;\n\
                }\n\
            }\n\
            if (new_size < n) {\n\
                batch->set_size(new_size);\n\
                batch->set_selected_in_use(true);\n\
            }\n\
        }\n\
    }\n\
    return true;\n\
}\n\n")

# C++ body for an IN-predicate filter.  Substituted keys: fn_signature, native_type1.
#
# Two fixes relative to the text this was reviewed from:
#   * the cast of `expr` now names its target type (InPredicate*), matching the
#     declared type of `in_pred`; a static_cast without a type does not compile.
#   * in the constant-probe branch the child evaluation check is negated, so a
#     failed evaluate() aborts the filter -- the same convention used by the
#     non-constant branch below and by filter_binary_op.
filter_in_op = string.Template("\
bool VectorComputeFunctions::${fn_signature}(\n\
        Expr* expr, VectorizedRowBatch* batch) {\n\
    int n = batch->size();\n\
    if (0 == n) {\n\
        return true;\n\
    }\n\
    batch->add_column(expr->output_column(), expr->type());\n\
    int* sel = batch->selected();\n\
    int num_children = expr->get_num_children();\n\
    Expr* op1 = expr->children()[0];\n\
    InPredicate *in_pred = static_cast<InPredicate*>(expr);\n\
\n\
    if (op1->is_constant()) {\n\
        void* value = op1->get_value(NULL);\n\
        if (!in_pred->hybird_set()->find(value)) {\n\
            batch->set_size(0);\n\
            return true;\n\
        }\n\
\n\
        if (num_children > 1) {\n\
            ${native_type1}* v = reinterpret_cast<${native_type1}*>(value);\n\
            ${native_type1}* vectors[num_children];\n\
            for (int i = 1; i < num_children; ++i) {\n\
                if (!expr->get_child(i)->evaluate(batch)) return false;\n\
                vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n\
            }\n\
\n\
            int new_size = 0;\n\
            if (batch->selected_in_use()) {\n\
                for (int j = 0; j != n; ++j) {\n\
                    int i = sel[j];\n\
                    for (int k = 1; k < num_children; ++k) {\n\
                        if (*v == vectors[k][i]) {\n\
                            sel[new_size++] = i;\n\
                            break;\n\
                        }\n\
                    }\n\
                }\n\
                batch->set_size(new_size);\n\
            } else {\n\
                for (int i = 0; i != n; ++i) {\n\
                    for (int k = 1; k < num_children; ++k) {\n\
                        if (*v == vectors[k][i]) {\n\
                            sel[new_size++] = i;\n\
                            break;\n\
                        }\n\
                    }\n\
                }\n\
\n\
                if (new_size < n) {\n\
                    batch->set_size(new_size);\n\
                    batch->set_selected_in_use(true);\n\
                }\n\
            }\n\
        }\n\
    } else {\n\
        int c1 = op1->evaluate(batch);\n\
        DCHECK(c1 >= 0);\n\
        ${native_type1}* vector1 \n\
            =reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
        if (0 != in_pred->hybird_set()->size()) {\n\
            int new_size = 0;\n\
            if (batch->selected_in_use()) {\n\
                for (int j = 0; j != n; ++j) {\n\
                    int i = sel[j];\n\
                    if (in_pred->hybird_set()->find(&vector1[i])) {\n\
                        sel[new_size++] = i;\n\
                    }\n\
                }\n\
                batch->set_size(new_size);\n\
            } else {\n\
                for (int i = 0; i != n; ++i) {\n\
                    if (in_pred->hybird_set()->find(&vector1[i])) {\n\
                        sel[new_size++] = i;\n\
                    }\n\
                }\n\
\n\
                if (new_size < n) {\n\
                    batch->set_size(new_size);\n\
                    batch->set_selected_in_use(true);\n\
                }\n\
            }\n\
        }\n\
\n\
        if (num_children > 1) {\n\
            ${native_type1}* vectors[num_children];\n\
            for (int i = 1; i < num_children; ++i) {\n\
                if (!expr->get_child(i)->evaluate(batch)) return false;\n\
                vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n\
            }\n\
\n\
            int new_size = 0;\n\
            if (batch->selected_in_use()) {\n\
                for (int j = 0; j != n; ++j) {\n\
                    int i = sel[j];\n\
                    for (int k = 1; k < num_children; ++k) {\n\
                        if (vector1[i] == vectors[k][i]) {\n\
                            sel[new_size++] = i;\n\
                            break;\n\
                        }\n\
                    }\n\
                }\n\
                batch->set_size(new_size);\n\
            } else {\n\
                for (int i = 0; i != n; ++i) {\n\
                    for (int k = 1; k < num_children; ++k) {\n\
                        if (vector1[i] == vectors[k][i]) {\n\
                            sel[new_size++] = i;\n\
                            break;\n\
                        }\n\
                    }\n\
                }\n\
\n\
                if (new_size < n) {\n\
                    batch->set_size(new_size);\n\
                    batch->set_selected_in_use(true);\n\
                }\n\
            }\n\
        }\n\
    }\n\
    return true;\n\
}\n\n")

# One entry of the generated python metadata list.
python_template = string.Template("\
    ['${fn_name}', '${return_type}', [${args}], 'VectorComputeFunctions::${fn_signature}', []], \n")

# Mapping of function to template
templates = {
    'Filter_Eq': filter_binary_op,
    'Filter_Ne': filter_binary_op,
    'Filter_Gt': filter_binary_op,
    'Filter_Lt': filter_binary_op,
    'Filter_Ge': filter_binary_op,
    'Filter_Le': filter_binary_op,
    'Filter_In': filter_in_op,
}

# Some aggregate types that are useful for defining functions
types = {
    'BOOLEAN': ['BOOLEAN'],
    'TINYINT': ['TINYINT'],
    'SMALLINT': ['SMALLINT'],
    'INT': ['INT'],
    'BIGINT': ['BIGINT'],
    'LARGEINT': ['LARGEINT'],
    'FLOAT': ['FLOAT'],
    'DOUBLE': ['DOUBLE'],
    'STRING': ['VARCHAR'],
    'DATE': ['DATE'],
    'DATETIME': ['DATETIME'],
    'DECIMAL': ['DECIMAL'],
    'NATIVE_INT_TYPES': ['TINYINT', 'SMALLINT', 'INT', 'BIGINT'],
    'INT_TYPES': ['TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'LARGEINT'],
    'FLOAT_TYPES': ['FLOAT', 'DOUBLE'],
    'NUMERIC_TYPES': ['TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE'],
    'NATIVE_TYPES': ['BOOLEAN', 'TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE'],
    'STRCAST_TYPES': ['BOOLEAN', 'SMALLINT', 'INT', 'BIGINT', 'FLOAT', 'DOUBLE'],
    'ALL_TYPES': ['BOOLEAN', 'TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'LARGEINT', 'FLOAT',
                  'DOUBLE', 'VARCHAR', 'DATETIME', 'DECIMAL'],
    'MAX_TYPES': ['BIGINT', 'LARGEINT', 'DOUBLE', 'DECIMAL'],
}

# Operation, [ReturnType], [[Args1], [Args2], ... [ArgsN]]
# An optional fourth element overrides the template looked up in `templates`.
functions = [
    # BinaryPredicates
    ['Filter_Eq', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
    ['Filter_Ne', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
    ['Filter_Gt', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
    ['Filter_Lt', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
    ['Filter_Ge', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],
    ['Filter_Le', ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]],

    # InPredicates
    ['Filter_In', ['BOOLEAN'], [['ALL_TYPES']]],
]

# Type name used as the suffix of the generated function name
native_types = {
    'BOOLEAN': 'bool',
    'TINYINT': 'char',
    'SMALLINT': 'short',
    'INT': 'int',
    'BIGINT': 'long',
    'LARGEINT': '__int128',
    'FLOAT': 'float',
    'DOUBLE': 'double',
    'VARCHAR': 'StringValue',
    'DATE': 'DateTimeValue',
    'DATETIME': 'DateTimeValue',
    'DECIMAL': 'DecimalValue',
}

# Portable type used in the function implementation
implemented_types = {
    'BOOLEAN': 'bool',
    'TINYINT': 'int8_t',
    'SMALLINT': 'int16_t',
    'INT': 'int32_t',
    'BIGINT': 'int64_t',
    'LARGEINT': '__int128',
    'FLOAT': 'float',
    'DOUBLE': 'double',
    'VARCHAR': 'StringValue',
    'DATE': 'DateTimeValue',
    'DATETIME': 'DateTimeValue',
    'DECIMAL': 'DecimalValue',
}

# C++ operator substituted for ${native_op}, keyed by operation name
native_ops = {
    'Filter_Eq': '==',
    'Filter_Ne': '!=',
    'Filter_Gt': '>',
    'Filter_Lt': '<',
    'Filter_Ge': '>=',
    'Filter_Le': '<=',
    'Eq': '==',
    'Ne': '!=',
    'Gt': '>',
    'Lt': '<',
    'Ge': '>=',
    'Le': '<=',
    'BITAND': '&',
    'BITNOT': '~',
    'BITOR': '|',
    'BITXOR': '^',
    'DIVIDE': '/',
    'EQ': '==',
    'GT': '>',
    'GE': '>=',
    'INT_DIVIDE': '/',
    'SUBTRACT': '-',
    'MOD': '%',
    'MULTIPLY': '*',
    'LT': '<',
    'LE': '<=',
    'NE': '!=',
    'ADD': '+',
}

# Not referenced elsewhere in this script; kept for anything that imports it.
native_funcs = {
    'EQ': 'Eq',
    'LE': 'Le',
    'LT': 'Lt',
    'NE': 'Ne',
    'GE': 'Ge',
    'GT': 'Gt',
}

# NOTE(review): the last #include had no header name in the reviewed text, which
# is not valid C++.  <boost/lexical_cast.hpp> is assumed because the preamble
# goes on to say `using namespace boost;` -- confirm against the build.
cc_preamble = '\
/***************************************************************************\n\
 * \n\
 * Copyright (c) 2014 Baidu.com, Inc. All Rights Reserved\n\
 * \n\
 **************************************************************************/\n\
\n\
// This is a generated file, DO NOT EDIT IT.\n\
// To add new functions, see impala/common/function-registry/gen_vector_functions.py\n\
\n\
#include "gen_cpp/opcode/vector-functions.h"\n\
#include "exprs/case_expr.h"\n\
#include "exprs/expr.h"\n\
#include "exprs/in_predicate.h"\n\
#include "runtime/string_value.hpp"\n\
#include "runtime/vectorized_row_batch.h"\n\
#include "util/string_parser.hpp"\n\
#include <boost/lexical_cast.hpp>\n\
\n\
using namespace boost;\n\
using namespace std;\n\
\n\
namespace palo { \n\
\n'

cc_epilogue = '\
}\n'

h_preamble = '\
/***************************************************************************\n\
 * \n\
 * Copyright (c) 2014 Baidu.com, Inc. All Rights Reserved\n\
 * \n\
 **************************************************************************/\n\
\n\
// This is a generated file, DO NOT EDIT IT.\n\
// To add new functions, see impala/common/function-registry/gen_vector_functions.py\n\
\n\
#ifndef BDG_PALO_OPCODE_VECTOR_FUNCTIONS_H\n\
#define BDG_PALO_OPCODE_VECTOR_FUNCTIONS_H\n\
\n\
namespace palo {\n\
class Expr;\n\
class OpcodeRegistry;\n\
class VectorizedRowBatch;\n\
\n\
class VectorComputeFunctions {\n\
public:\n'

h_epilogue = '\
};\n\
\n\
}\n\
\n\
#endif\n'

python_preamble = '\
#!/usr/bin/env python\n\
# Modifications copyright (C) 2017, Baidu.com, Inc. \n\
# Copyright 2017 The Apache Software Foundation \n\
# \n\
# Licensed to the Apache Software Foundation (ASF) under one \n\
# or more contributor license agreements. See the NOTICE file \n\
# distributed with this work for additional information \n\
# regarding copyright ownership. The ASF licenses this file \n\
# to you under the Apache License, Version 2.0 (the \n\
# "License"); you may not use this file except in compliance \n\
# with the License. You may obtain a copy of the License at \n\
# \n\
# http://www.apache.org/licenses/LICENSE-2.0\n\
# \n\
# Unless required by applicable law or agreed to in writing, software\n\
# distributed under the License is distributed on an "AS IS" BASIS,\n\
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\
# See the License for the specific language governing permissions and\n\
# limitations under the License.\n\
\n\
# This is a generated file, DO NOT EDIT IT.\n\
# To add new functions, see impala/common/function-registry/gen_opcodes.py\n\
\n\
functions = [\n'

python_epilogue = ']'

# Declaration emitted into the header for each generated function.
header_template = string.Template("\
    static bool ${fn_signature}(\n\
        Expr* e, VectorizedRowBatch* batch);\n")

# Output directory for the generated C++ files; created when the module is loaded.
BE_PATH = "../gen_cpp/opcode/"
if not os.path.exists(BE_PATH):
    os.makedirs(BE_PATH)


def initialize_sub(op, return_type, arg_types):
    """
    Expand the signature data for template substitution.

    op          -- operation name; also the prefix of the generated function name
    return_type -- single concrete return type name
    arg_types   -- list of concrete argument type names (keys of `native_types`)

    Returns a dictionary with all the entries for all the templates used in
    this script: fn_name, fn_signature, return_type, args, native_type<N> for
    each argument (1-based), and native_op when `op` has an entry in `native_ops`.
    """
    sub = {}
    sub["fn_name"] = op
    sub["fn_signature"] = op
    sub["return_type"] = return_type
    sub["args"] = ""
    if op in native_ops:
        sub["native_op"] = native_ops[op]
    for idx, arg in enumerate(arg_types):
        # The function name is suffixed with the native type of every argument,
        # while the body is written against the fixed-width implemented type.
        sub["fn_signature"] += "_" + native_types[arg]
        sub["native_type" + repr(idx + 1)] = implemented_types[arg]
        sub["args"] += "'" + arg + "', "
    return sub


def _invalid_declaration(func_data):
    """Report a malformed `functions` entry and terminate with exit status 1."""
    # func_data is a list, so it must be converted before concatenation.
    print("Invalid Declaration: " + str(func_data))
    sys.exit(1)


def _expand_type_names(names):
    """Flatten aggregate type names into the concrete type names listed in `types`."""
    expanded = []
    for name in names:
        expanded.extend(types[name])
    return expanded


def _expand_signatures(func_data):
    """
    Expand one `functions` entry into a list of (return_type, arg_types) pairs.

    Every expanded type list must contain either a single type (reused for all
    generated functions) or exactly as many types as the longest list; any other
    length is reported through _invalid_declaration().
    """
    return_types = _expand_type_names(func_data[1])
    signatures = [_expand_type_names(args) for args in func_data[2]]

    num_functions = max([len(return_types)] + [len(args) for args in signatures])

    # Validate the input is correct
    for candidates in [return_types] + signatures:
        if len(candidates) != 1 and len(candidates) != num_functions:
            _invalid_declaration(func_data)

    expanded = []
    for i in range(num_functions):
        return_type = return_types[0] if len(return_types) == 1 else return_types[i]
        arg_types = [args[0] if len(args) == 1 else args[i] for args in signatures]
        expanded.append((return_type, arg_types))
    return expanded


def main():
    """Render every entry of `functions` into the header, source and metadata files."""
    # `with` guarantees the files are closed even when a declaration is invalid
    # and the script exits early.
    with open(BE_PATH + 'vector-functions.h', 'w') as h_file, \
            open(BE_PATH + 'vector-functions.cc', 'w') as cc_file, \
            open('generated_vector_functions.py', 'w') as python_file:
        h_file.write(h_preamble)
        cc_file.write(cc_preamble)
        python_file.write(python_preamble)

        # Generate functions and headers
        for func_data in functions:
            op = func_data[0]
            if len(func_data) >= 4:
                # If a specific template has been specified, use that one.
                template = func_data[3]
            elif op in templates:
                template = templates[op]
            else:
                # Skip functions with no template (shouldn't be auto-generated)
                continue

            # Each pair holds a single return type and a list of single argument types.
            for return_type, arg_types in _expand_signatures(func_data):
                sub = initialize_sub(op, return_type, arg_types)
                h_file.write(header_template.substitute(sub))
                cc_file.write(template.substitute(sub))
                python_file.write(python_template.substitute(sub))

        h_file.write(h_epilogue)
        cc_file.write(cc_epilogue)
        python_file.write(python_epilogue)


if __name__ == "__main__":
    main()