558 lines
19 KiB
Python
Executable File
558 lines
19 KiB
Python
Executable File
# Licensed to the Apache Software Foundation (ASF) under one
|
|
# or more contributor license agreements. See the NOTICE file
|
|
# distributed with this work for additional information
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
# to you under the Apache License, Version 2.0 (the
|
|
# "License"); you may not use this file except in compliance
|
|
# with the License. You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing,
|
|
# software distributed under the License is distributed on an
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
# KIND, either express or implied. See the License for the
|
|
# specific language governing permissions and limitations
|
|
# under the License.
|
|
|
|
"""
|
|
# This script will generate the implementation of the simple vector functions for the BE.
|
|
# These include:
|
|
# - Arithmetic functions
|
|
# - Binary functions
|
|
# - Cast functions
|
|
#
|
|
# The script outputs (run: 'src/common/function/gen_vector_functions.py')
|
|
# - header and implementation for above functions:
|
|
# - src/gen_cpp/opcode/vector_functions.[h/cc]
|
|
# - python file that contains the metadata for those functions:
|
|
# - src/gen_cpp/generated_vector_functions.py
|
|
"""
|
|
|
|
import string
|
|
import os
|
|
|
|
# C++ source template for one generated binary-comparison filter.
#
# Placeholders filled in by initialize_sub():
#   ${fn_signature}  - name of the generated static member function
#   ${native_type1}  - C++ element type of the first operand
#   ${native_type2}  - C++ element type of the second operand
#   ${native_op}     - C++ comparison operator taken from native_ops
#
# The generated function handles four operand shapes (both constant, only
# op1 constant, only op2 constant, both column vectors).  In the vector
# cases, rows that satisfy the comparison are compacted to the front of the
# batch's selection array `sel`, and the batch size is updated to match.
#
# Fix relative to the previous version: in the "op1 is constant" branch the
# column being scanned belongs to op2, so it is now cast to ${native_type2}
# (it used to be cast to ${native_type1}).
filter_binary_op = string.Template(
    "bool VectorComputeFunctions::${fn_signature}(\n"
    "Expr* expr, VectorizedRowBatch* batch) {\n"
    "int n = batch->size();\n"
    "if (0 == n) {\n"
    "return false;\n"
    "}\n"
    "int* sel = batch->selected();\n"
    "Expr* op1 = expr->children()[0];\n"
    "Expr* op2 = expr->children()[1];\n"
    "batch->add_column(expr->output_column(), expr->type());\n"
    # Shape 1: the whole expression is constant -> compare the two scalars.
    "if (expr->is_constant()) {\n"
    "${native_type1}* val1 = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n"
    "${native_type2}* val2 = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n"
    "if (val1 == NULL || val2 == NULL) return false;\n"
    "if (!(*val1 ${native_op} *val2)) batch->set_size(0);\n"
    # Shape 2: constant op1 compared against the op2 column.
    "} else if (op1->is_constant()) {\n"
    "${native_type1}* value = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n"
    "if (NULL == value || !op2->evaluate(batch)) return false;\n"
    "${native_type2}* vector2\n"
    "= reinterpret_cast<${native_type2}*>(batch->column(op2->output_column())->col_data());\n"
    "\n"
    "int new_size = 0;\n"
    "if (batch->selected_in_use()) {\n"
    "for (int j = 0; j != n; ++j) {\n"
    "int i = sel[j];\n"
    "if (*value ${native_op} vector2[i]) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "batch->set_size(new_size);\n"
    "} else {\n"
    "for (int i = 0; i != n; ++i) {\n"
    "if (*value ${native_op} vector2[i]) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "\n"
    "if (new_size < n) {\n"
    "batch->set_size(new_size);\n"
    "batch->set_selected_in_use(true);\n"
    "}\n"
    "}\n"
    # Shape 3: the op1 column compared against constant op2.
    "} else if (op2->is_constant()) {\n"
    "${native_type2}* value = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n"
    "if (NULL == value || !op1->evaluate(batch)) return false;\n"
    "${native_type1}* vector1\n"
    "= reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n"
    "\n"
    "int new_size = 0;\n"
    "if (batch->selected_in_use()) {\n"
    "for (int j = 0; j != n; ++j) {\n"
    "int i = sel[j];\n"
    "if (vector1[i] ${native_op} *value) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "batch->set_size(new_size);\n"
    "} else {\n"
    "for (int i = 0; i != n; ++i) {\n"
    "if (vector1[i] ${native_op} *value) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "\n"
    "if (new_size < n) {\n"
    "batch->set_size(new_size);\n"
    "batch->set_selected_in_use(true);\n"
    "}\n"
    "}\n"
    # Shape 4: both operands are column vectors.
    "} else {\n"
    "if (!op1->evaluate(batch) || !op2->evaluate(batch)) return false;\n"
    "${native_type1}* vector1\n"
    "= reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n"
    "${native_type2}* vector2\n"
    "= reinterpret_cast<${native_type2}*>(batch->column(op2->output_column())->col_data());\n"
    "\n"
    "int new_size = 0;\n"
    "if (batch->selected_in_use()) {\n"
    "for (int j = 0; j != n; ++j) {\n"
    "int i = sel[j];\n"
    "if (vector1[i] ${native_op} vector2[i]) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "batch->set_size(new_size);\n"
    "} else {\n"
    "for (int i = 0; i != n; ++i) {\n"
    "if (vector1[i] ${native_op} vector2[i]) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "if (new_size < n) {\n"
    "batch->set_size(new_size);\n"
    "batch->set_selected_in_use(true);\n"
    "}\n"
    "}\n"
    "}\n"
    "return true;\n"
    "}\n\n"
)
|
|
|
|
# C++ source template for one generated IN-predicate filter.
#
# Placeholders filled in by initialize_sub():
#   ${fn_signature}  - name of the generated static member function
#   ${native_type1}  - C++ element type of the tested operand
#
# The generated function treats child 0 as the tested operand.  It consults
# the InPredicate's hybrid set and, when there is more than one child,
# also compares against the evaluated columns of children 1..N-1.  Matching
# rows are compacted to the front of the batch's selection array `sel`.
#
# Fix relative to the previous version: in the constant-operand branch the
# child evaluation check was missing its negation, so the generated code
# returned false exactly when evaluate() reported success.  It now matches
# the non-constant branch and the binary-filter template.
filter_in_op = string.Template(
    "bool VectorComputeFunctions::${fn_signature}(\n"
    "Expr* expr, VectorizedRowBatch* batch) {\n"
    "int n = batch->size();\n"
    "if (0 == n) {\n"
    "return true;\n"
    "}\n"
    "batch->add_column(expr->output_column(), expr->type());\n"
    "int* sel = batch->selected();\n"
    "int num_children = expr->get_num_children();\n"
    "Expr* op1 = expr->children()[0];\n"
    "InPredicate *in_pred = static_cast<InPredicate*>(expr);\n"
    "\n"
    # Branch A: the tested operand is a constant.
    "if (op1->is_constant()) {\n"
    "void* value = op1->get_value(NULL);\n"
    "if (!in_pred->hybird_set()->find(value)) {\n"
    "batch->set_size(0);\n"
    "return true;\n"
    "}\n"
    "\n"
    "if (num_children > 1) {\n"
    "${native_type1}* v = reinterpret_cast<${native_type1}*>(value);\n"
    "${native_type1}* vectors[num_children];\n"
    "for (int i = 1; i < num_children; ++i) {\n"
    "if (!expr->get_child(i)->evaluate(batch)) return false;\n"
    "vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n"
    "}\n"
    "\n"
    "int new_size = 0;\n"
    "if (batch->selected_in_use()) {\n"
    "for (int j = 0; j != n; ++j) {\n"
    "int i = sel[j];\n"
    "for (int k = 1; k < num_children; ++k) {\n"
    "if (*v == vectors[k][i]) {\n"
    "sel[new_size++] = i;\n"
    "break;\n"
    "}\n"
    "}\n"
    "}\n"
    "batch->set_size(new_size);\n"
    "} else {\n"
    "for (int i = 0; i != n; ++i) {\n"
    "for (int k = 1; k < num_children; ++k) {\n"
    "if (*v == vectors[k][i]) {\n"
    "sel[new_size++] = i;\n"
    "break;\n"
    "}\n"
    "}\n"
    "}\n"
    "\n"
    "if (new_size < n) {\n"
    "batch->set_size(new_size);\n"
    "batch->set_selected_in_use(true);\n"
    "}\n"
    "}\n"
    "}\n"
    # Branch B: the tested operand is a column vector.
    "} else {\n"
    "int c1 = op1->evaluate(batch);\n"
    "DCHECK(c1 >= 0);\n"
    "${native_type1}* vector1 \n"
    "=reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n"
    # B.1: filter against the hybrid set when it is non-empty.
    "if (0 != in_pred->hybird_set()->size()) {\n"
    "int new_size = 0;\n"
    "if (batch->selected_in_use()) {\n"
    "for (int j = 0; j != n; ++j) {\n"
    "int i = sel[j];\n"
    "if (in_pred->hybird_set()->find(&vector1[i])) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "batch->set_size(new_size);\n"
    "} else {\n"
    "for (int i = 0; i != n; ++i) {\n"
    "if (in_pred->hybird_set()->find(&vector1[i])) {\n"
    "sel[new_size++] = i;\n"
    "}\n"
    "}\n"
    "\n"
    "if (new_size < n) {\n"
    "batch->set_size(new_size);\n"
    "batch->set_selected_in_use(true);\n"
    "}\n"
    "}\n"
    "}\n"
    "\n"
    # B.2: filter against the evaluated columns of the remaining children.
    "if (num_children > 1) {\n"
    "${native_type1}* vectors[num_children];\n"
    "for (int i = 1; i < num_children; ++i) {\n"
    "if (!expr->get_child(i)->evaluate(batch)) return false;\n"
    "vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n"
    "}\n"
    "\n"
    "int new_size = 0;\n"
    "if (batch->selected_in_use()) {\n"
    "for (int j = 0; j != n; ++j) {\n"
    "int i = sel[j];\n"
    "for (int k = 1; k < num_children; ++k) {\n"
    "if (vector1[i] == vectors[k][i]) {\n"
    "sel[new_size++] = i;\n"
    "break;\n"
    "}\n"
    "}\n"
    "}\n"
    "batch->set_size(new_size);\n"
    "} else {\n"
    "for (int i = 0; i != n; ++i) {\n"
    "for (int k = 1; k < num_children; ++k) {\n"
    "if (vector1[i] == vectors[k][i]) {\n"
    "sel[new_size++] = i;\n"
    "break;\n"
    "}\n"
    "}\n"
    "}\n"
    "\n"
    "if (new_size < n) {\n"
    "batch->set_size(new_size);\n"
    "batch->set_selected_in_use(true);\n"
    "}\n"
    "}\n"
    "}\n"
    "}\n"
    "return true;\n"
    "}\n\n"
)
|
|
|
|
# One row of the generated Python metadata list:
# [name, return type, [argument types], C++ symbol, []].
python_template = string.Template(
    "['${fn_name}', '${return_type}', [${args}], "
    "'VectorComputeFunctions::${fn_signature}', []], \n"
)
|
|
|
|
# Operation name -> C++ template used to generate its implementation.
# All six comparison filters share the binary template; IN has its own.
templates = dict.fromkeys(
    ('Filter_Eq', 'Filter_Ne', 'Filter_Gt', 'Filter_Lt', 'Filter_Ge', 'Filter_Le'),
    filter_binary_op,
)
templates['Filter_In'] = filter_in_op
|
|
|
|
# Type groups usable in function declarations: every key expands to the list
# of concrete column types it stands for.
def _build_types():
    """Return the mapping from type-group name to its list of concrete types."""
    ints = ['TINYINT', 'SMALLINT', 'INT', 'BIGINT']
    floats = ['FLOAT', 'DOUBLE']

    table = {}
    # Scalar entries that simply expand to themselves.
    for name in ('BOOLEAN', 'TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'LARGEINT',
                 'FLOAT', 'DOUBLE', 'DATE', 'DATETIME', 'DECIMAL'):
        table[name] = [name]
    table['STRING'] = ['VARCHAR']

    # Aggregate groups.
    table['NATIVE_INT_TYPES'] = list(ints)
    table['INT_TYPES'] = ints + ['LARGEINT']
    table['FLOAT_TYPES'] = list(floats)
    table['NUMERIC_TYPES'] = ints + floats
    table['NATIVE_TYPES'] = ['BOOLEAN'] + ints + floats
    table['STRCAST_TYPES'] = ['BOOLEAN', 'SMALLINT', 'INT', 'BIGINT'] + floats
    table['ALL_TYPES'] = (['BOOLEAN'] + ints + ['LARGEINT'] + floats
                          + ['VARCHAR', 'DATETIME', 'DECIMAL'])
    table['MAX_TYPES'] = ['BIGINT', 'LARGEINT', 'DOUBLE', 'DECIMAL']
    return table


types = _build_types()
|
|
|
|
# Declarations to generate, each shaped as:
#   [Operation, [ReturnType], [[Args1], [Args2], ... [ArgsN]]]
# Binary predicates take two ALL_TYPES operands; the IN predicate takes one.
functions = list(
    [predicate, ['BOOLEAN'], [['ALL_TYPES'], ['ALL_TYPES']]]
    for predicate in ('Filter_Eq', 'Filter_Ne', 'Filter_Gt',
                      'Filter_Lt', 'Filter_Ge', 'Filter_Le')
)
functions.append(['Filter_In', ['BOOLEAN'], [['ALL_TYPES']]])
|
|
|
|
# Column type -> C++ type name used as the suffix of generated function
# signatures (see initialize_sub).
native_types = dict(
    BOOLEAN='bool',
    TINYINT='char',
    SMALLINT='short',
    INT='int',
    BIGINT='long',
    LARGEINT='__int128',
    FLOAT='float',
    DOUBLE='double',
    VARCHAR='StringValue',
    DATE='DateTimeValue',
    DATETIME='DateTimeValue',
    DECIMAL='DecimalValue',
)
|
|
|
|
# Column type -> portable C++ type substituted into the generated function
# bodies (fixed-width integers instead of char/short/int/long).
implemented_types = dict(zip(
    ('BOOLEAN', 'TINYINT', 'SMALLINT', 'INT', 'BIGINT', 'LARGEINT',
     'FLOAT', 'DOUBLE', 'VARCHAR', 'DATE', 'DATETIME', 'DECIMAL'),
    ('bool', 'int8_t', 'int16_t', 'int32_t', 'int64_t', '__int128',
     'float', 'double', 'StringValue', 'DateTimeValue', 'DateTimeValue', 'DecimalValue'),
))
|
|
|
|
# Operation name -> C++ operator token.  Each comparison is registered under
# three spellings: 'Filter_Xx', 'Xx' and 'XX'.
native_ops = {}
for _cmp_name, _cmp_symbol in (('Eq', '=='), ('Ne', '!='), ('Gt', '>'),
                               ('Lt', '<'), ('Ge', '>='), ('Le', '<=')):
    native_ops['Filter_' + _cmp_name] = _cmp_symbol
    native_ops[_cmp_name] = _cmp_symbol
    native_ops[_cmp_name.upper()] = _cmp_symbol
del _cmp_name, _cmp_symbol

# Bitwise and arithmetic operators.
native_ops.update({
    'BITAND': '&',
    'BITNOT': '~',
    'BITOR': '|',
    'BITXOR': '^',
    'DIVIDE': '/',
    'INT_DIVIDE': '/',
    'SUBTRACT': '-',
    'MOD': '%',
    'MULTIPLY': '*',
    'ADD': '+',
})
|
|
|
|
# Upper-case comparison opcode -> capitalised function-name fragment.
native_funcs = dict(
    (fragment.upper(), fragment)
    for fragment in ('Eq', 'Le', 'Lt', 'Ne', 'Ge', 'Gt')
)
|
|
|
|
# Text written at the top of the generated .cc file: banner comment,
# includes, using-directives and the opening of namespace doris.
cc_preamble = (
    '// This is a generated file, DO NOT EDIT IT.\n'
    '// To add new functions, see impala/common/function-registry/gen_vector_functions.py\n'
    '\n'
    '#include "gen_cpp/opcode/vector-functions.h"\n'
    '#include "exprs/case_expr.h"\n'
    '#include "exprs/expr.h"\n'
    '#include "exprs/in_predicate.h"\n'
    '#include "runtime/string_value.hpp"\n'
    '#include "runtime/vectorized_row_batch.h"\n'
    '#include "util/string_parser.hpp"\n'
    '#include <boost/lexical_cast.hpp>\n'
    '\n'
    'using namespace boost;\n'
    'using namespace std;\n'
    '\n'
    'namespace doris { \n'
    '\n'
)

# Text written at the end of the generated .cc file (closes the namespace).
cc_epilogue = '}\n'
|
|
|
|
# Text written at the top of the generated header: banner comment, include
# guard, forward declarations and the opening of class VectorComputeFunctions.
h_preamble = '\n'.join([
    '// This is a generated file, DO NOT EDIT IT.',
    '// To add new functions, see impala/common/function-registry/gen_vector_functions.py',
    '',
    '#ifndef DORIS_OPCODE_VECTOR_FUNCTIONS_H',
    '#define DORIS_OPCODE_VECTOR_FUNCTIONS_H',
    '',
    'namespace doris {',
    'class Expr;',
    'class OpcodeRegistry;',
    'class VectorizedRowBatch;',
    '',
    'class VectorComputeFunctions {',
    'public:',
]) + '\n'

# Text written at the end of the generated header: closes the class, the
# namespace and the include guard.
h_epilogue = '\n'.join([
    '};',
    '',
    '}',
    '',
    '#endif',
]) + '\n'
|
|
|
|
# Text written at the top of the generated Python metadata file: shebang,
# license header, "generated file" banner and the opening of the
# `functions` list.
#
# The banner now names gen_vector_functions.py as the script to edit, in
# line with the banners of the generated .h/.cc files; it previously pointed
# at gen_opcodes.py.  Trailing spaces inside the license lines are kept
# as they were.
python_preamble = (
    '#!/usr/bin/env python\n'
    '# Licensed to the Apache Software Foundation (ASF) under one \n'
    '# or more contributor license agreements. See the NOTICE file \n'
    '# distributed with this work for additional information \n'
    '# regarding copyright ownership. The ASF licenses this file \n'
    '# to you under the Apache License, Version 2.0 (the \n'
    '# "License"); you may not use this file except in compliance \n'
    '# with the License. You may obtain a copy of the License at \n'
    '# \n'
    '# http://www.apache.org/licenses/LICENSE-2.0\n'
    '# \n'
    '# Unless required by applicable law or agreed to in writing, software\n'
    '# distributed under the License is distributed on an "AS IS" BASIS,\n'
    '# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n'
    '# See the License for the specific language governing permissions and\n'
    '# limitations under the License.\n'
    '\n'
    '# This is a generated file, DO NOT EDIT IT.\n'
    '# To add new functions, see impala/common/function-registry/gen_vector_functions.py\n'
    '\n'
    'functions = [\n'
)

# Closes the `functions` list opened by python_preamble.
python_epilogue = ']'
|
|
|
|
# Declaration emitted into the generated header for every generated function.
header_template = string.Template(
    "static bool ${fn_signature}(\n"
    "Expr* e, VectorizedRowBatch* batch);\n"
)
|
|
|
|
# Directory (relative to the current working directory) that receives the
# generated C++ header and source files.
BE_PATH = "../gen_cpp/opcode/"

# Create the output directory if needed.  Attempting the creation and
# tolerating "already there" avoids the race between a separate existence
# check and makedirs() when several generators run at the same time.
try:
    os.makedirs(BE_PATH)
except OSError:
    # Only swallow the error when the directory really is present.
    if not os.path.isdir(BE_PATH):
        raise
|
|
|
|
def initialize_sub(op, return_type, arg_types):
    """
    Build the substitution dictionary shared by all templates in this script.

    op          -- operation name, e.g. 'Filter_Eq'
    return_type -- single return type name
    arg_types   -- list of single argument type names

    The result always contains 'fn_name', 'fn_signature', 'return_type' and
    'args', plus one 'native_typeN' entry per argument (N starts at 1).
    'native_op' is present only when op has an entry in native_ops.
    """
    sub = {
        "fn_name": op,
        "return_type": return_type,
    }
    if op in native_ops:
        sub["native_op"] = native_ops[op]

    signature_parts = [op]
    quoted_args = []
    for position, arg in enumerate(arg_types, 1):
        # The signature suffix uses the C-style name; the body uses the
        # portable fixed-width name.
        signature_parts.append(native_types[arg])
        sub["native_type%d" % position] = implemented_types[arg]
        quoted_args.append("'" + arg + "', ")

    sub["fn_signature"] = "_".join(signature_parts)
    sub["args"] = "".join(quoted_args)
    return sub
|
|
|
|
if __name__ == "__main__":
    # Script entry point: expand every declaration in `functions` into its
    # concrete signatures and write the C++ header, the C++ source and the
    # Python metadata file.
    #
    # `sys` is imported here because the error path below calls sys.exit()
    # and the module does not import it at file level.
    import sys

    # The context manager closes all three files even when generation aborts.
    with open(BE_PATH + 'vector-functions.h', 'w') as h_file, \
            open(BE_PATH + 'vector-functions.cc', 'w') as cc_file, \
            open('generated_vector_functions.py', 'w') as python_file:
        h_file.write(h_preamble)
        cc_file.write(cc_preamble)
        python_file.write(python_preamble)

        # Generate functions and headers
        for func_data in functions:
            op = func_data[0]
            if len(func_data) >= 4:
                # A specific template was given in the declaration itself.
                template = func_data[3]
            elif op in templates:
                template = templates[op]
            else:
                # No template: this function is not auto-generated.
                continue

            # Expand type groups into concrete type names.
            return_types = [t for ret in func_data[1] for t in types[ret]]
            signatures = [[t for arg in args for t in types[arg]]
                          for args in func_data[2]]

            # Number of concrete functions this declaration expands to.
            num_functions = max([len(return_types)]
                                + [len(args) for args in signatures])

            # Every expanded list must hold either one entry (shared by all
            # generated functions) or exactly one entry per function.
            allowed_lengths = (1, num_functions)
            declaration_ok = (
                len(return_types) in allowed_lengths
                and all(len(args) in allowed_lengths for args in signatures)
            )
            if not declaration_ok:
                # func_data is a list, so it must be converted before being
                # appended to the message.
                print("Invalid Declaration: " + str(func_data))
                sys.exit(1)

            # Iterate over every function signature to generate
            for i in range(num_functions):
                if len(return_types) == 1:
                    return_type = return_types[0]
                else:
                    return_type = return_types[i]
                arg_types = [sig[0] if len(sig) == 1 else sig[i]
                             for sig in signatures]

                # Here 'return_type' is a single type and 'arg_types' is a
                # list of single types.
                sub = initialize_sub(op, return_type, arg_types)

                h_file.write(header_template.substitute(sub))
                cc_file.write(template.substitute(sub))
                python_file.write(python_template.substitute(sub))

        h_file.write(h_epilogue)
        cc_file.write(cc_epilogue)
        python_file.write(python_epilogue)
|