Files
doris/gensrc/script/gen_vector_functions.py
Zhengguo Yang ed3ff470ce [ARRAY] Support array type load and select not include access by index (#5980)
This is part of the array type support and has not been fully completed. 
The following functions are implemented
1. fe array type support and implementation of array function, support array syntax analysis and planning
2. Support import array type data through insert into
3. Support select array type data
4. Only the array type is supported on the value column of the duplicate table

This PR merges some code from #4655 #4650 #4644 #4643 #4623 #2979
2021-07-13 14:02:39 +08:00

561 lines
19 KiB
Python
Executable File

#!/usr/bin/env python
# encoding: utf-8
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""
# This script will generate the implementation of the simple vector functions for the BE.
# These include:
# - Arithmetic functions
# - Binary functions
# - Cast functions
#
# The script outputs (run: 'src/common/function/gen_vector_functions.py')
# - header and implementation for above functions:
# - src/gen_cpp/opcode/vector_functions.[h/cc]
# - python file that contains the metadata for those functions:
# - src/gen_cpp/generated_vector_functions.py
"""
import os
import string
import sys
# C++ source template for a vectorized binary-comparison filter.
# Placeholders: ${fn_signature} (name of the generated function),
# ${native_type1} / ${native_type2} (C++ types the left / right operand
# values are cast to) and ${native_op} (the C++ comparison operator).
# The emitted function distinguishes four cases: the whole expression is
# constant, only op1 is constant, only op2 is constant, or neither is.
# In the three non-constant cases it rewrites the batch's selection array so
# that it holds the indices of the rows for which the comparison is true.
# NOTE(review): an empty batch makes the emitted function return false,
# whereas the IN-predicate template returns true for the same situation --
# confirm which is intended.
# NOTE(review): in the op1-constant branch the op2 column is cast to
# ${native_type1} rather than ${native_type2}; presumably harmless while both
# operands always share one type -- confirm.
filter_binary_op = string.Template("\
bool VectorComputeFunctions::${fn_signature}(\n\
Expr* expr, VectorizedRowBatch* batch) {\n\
int n = batch->size();\n\
if (0 == n) {\n\
return false;\n\
}\n\
int* sel = batch->selected();\n\
Expr* op1 = expr->children()[0];\n\
Expr* op2 = expr->children()[1];\n\
batch->add_column(expr->output_column(), expr->type());\n\
if (expr->is_constant()) {\n\
${native_type1}* val1 = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n\
${native_type2}* val2 = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n\
if (val1 == NULL || val2 == NULL) return false;\n\
if (!(*val1 ${native_op} *val2)) batch->set_size(0);\n\
} else if (op1->is_constant()) {\n\
${native_type1}* value = reinterpret_cast<${native_type1}*>(op1->get_value(NULL));\n\
if (NULL == value || !op2->evaluate(batch)) return false;\n\
${native_type1}* vector1\n\
= reinterpret_cast<${native_type1}*>(batch->column(op2->output_column())->col_data());\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (*value ${native_op} vector1[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (*value ${native_op} vector1[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
} else if (op2->is_constant()) {\n\
${native_type2}* value = reinterpret_cast<${native_type2}*>(op2->get_value(NULL));\n\
if (NULL == value || !op1->evaluate(batch)) return false;\n\
${native_type1}* vector1\n\
= reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (vector1[i] ${native_op} *value) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (vector1[i] ${native_op} *value) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
} else {\n\
if (!op1->evaluate(batch) || !op2->evaluate(batch)) return false;\n\
${native_type1}* vector1\n\
= reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
${native_type2}* vector2\n\
= reinterpret_cast<${native_type2}*>(batch->column(op2->output_column())->col_data());\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (vector1[i] ${native_op} vector2[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (vector1[i] ${native_op} vector2[i]) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
return true;\n\
}\n\n")
# C++ source template for a vectorized IN-predicate filter.
# Placeholders: ${fn_signature} (name of the generated function) and
# ${native_type1} (C++ type used for the probed value and for the column data
# of every additional child expression).
# The emitted function has two main paths:
#   * op1 constant: the value is looked up in the predicate's hybrid set and,
#     when there are further children, compared against each child's column.
#   * op1 not constant: the op1 column is filtered through the hybrid set (if
#     it is non-empty) and then compared against the children's columns.
# In both paths the batch's selection array is rewritten with the indices of
# the rows that matched.
# The child-evaluation check in the constant path uses the same
# "!child->evaluate(batch) -> return false" convention as the non-constant
# path and the binary filter template; it previously lacked the negation and
# therefore bailed out whenever a child evaluated successfully.
filter_in_op = string.Template("\
bool VectorComputeFunctions::${fn_signature}(\n\
Expr* expr, VectorizedRowBatch* batch) {\n\
int n = batch->size();\n\
if (0 == n) {\n\
return true;\n\
}\n\
batch->add_column(expr->output_column(), expr->type());\n\
int* sel = batch->selected();\n\
int num_children = expr->get_num_children();\n\
Expr* op1 = expr->children()[0];\n\
InPredicate *in_pred = static_cast<InPredicate*>(expr);\n\
\n\
if (op1->is_constant()) {\n\
void* value = op1->get_value(NULL);\n\
if (!in_pred->hybird_set()->find(value)) {\n\
batch->set_size(0);\n\
return true;\n\
}\n\
\n\
if (num_children > 1) {\n\
${native_type1}* v = reinterpret_cast<${native_type1}*>(value);\n\
${native_type1}* vectors[num_children];\n\
for (int i = 1; i < num_children; ++i) {\n\
if (!expr->get_child(i)->evaluate(batch)) return false;\n\
vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n\
}\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
for (int k = 1; k < num_children; ++k) {\n\
if (*v == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
for (int k = 1; k < num_children; ++k) {\n\
if (*v == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
} else {\n\
int c1 = op1->evaluate(batch);\n\
DCHECK(c1 >= 0);\n\
${native_type1}* vector1 \n\
=reinterpret_cast<${native_type1}*>(batch->column(op1->output_column())->col_data());\n\
if (0 != in_pred->hybird_set()->size()) {\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
if (in_pred->hybird_set()->find(&vector1[i])) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
if (in_pred->hybird_set()->find(&vector1[i])) {\n\
sel[new_size++] = i;\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
\n\
if (num_children > 1) {\n\
${native_type1}* vectors[num_children];\n\
for (int i = 1; i < num_children; ++i) {\n\
if (!expr->get_child(i)->evaluate(batch)) return false;\n\
vectors[i] = reinterpret_cast<${native_type1}*>(batch->column(expr->get_child(i)->output_column())->col_data());\n\
}\n\
\n\
int new_size = 0;\n\
if (batch->selected_in_use()) {\n\
for (int j = 0; j != n; ++j) {\n\
int i = sel[j];\n\
for (int k = 1; k < num_children; ++k) {\n\
if (vector1[i] == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
batch->set_size(new_size);\n\
} else {\n\
for (int i = 0; i != n; ++i) {\n\
for (int k = 1; k < num_children; ++k) {\n\
if (vector1[i] == vectors[k][i]) {\n\
sel[new_size++] = i;\n\
break;\n\
}\n\
}\n\
}\n\
\n\
if (new_size < n) {\n\
batch->set_size(new_size);\n\
batch->set_selected_in_use(true);\n\
}\n\
}\n\
}\n\
}\n\
return true;\n\
}\n\n")
# Template for one row of the generated Python metadata list. Each row holds
# the function name, the return type, the quoted argument types, the qualified
# C++ symbol and a trailing empty list (its meaning is not visible in this
# script).
python_template = string.Template("\
['${fn_name}', '${return_type}', [${args}], 'VectorComputeFunctions::${fn_signature}', []], \n")
# Operation name -> C++ template used to generate it. Every binary comparison
# filter shares one template; the IN predicate has a template of its own.
templates = {
    "Filter_" + suffix: filter_binary_op
    for suffix in ("Eq", "Ne", "Gt", "Lt", "Ge", "Le")
}
templates["Filter_In"] = filter_in_op
# Type aliases usable in function declarations. Each alias expands to the
# list of concrete type names it stands for; most scalar names expand to
# themselves, the remaining entries are convenience groups.
types = {
    "BOOLEAN": ["BOOLEAN"],
    "TINYINT": ["TINYINT"],
    "SMALLINT": ["SMALLINT"],
    "INT": ["INT"],
    "BIGINT": ["BIGINT"],
    "LARGEINT": ["LARGEINT"],
    "FLOAT": ["FLOAT"],
    "DOUBLE": ["DOUBLE"],
    "STRING": ["VARCHAR"],
    "DATE": ["DATE"],
    "DATETIME": ["DATETIME"],
    "DECIMALV2": ["DECIMALV2"],
    "NATIVE_INT_TYPES": ["TINYINT", "SMALLINT", "INT", "BIGINT"],
    "INT_TYPES": ["TINYINT", "SMALLINT", "INT", "BIGINT", "LARGEINT"],
    "FLOAT_TYPES": ["FLOAT", "DOUBLE"],
    "NUMERIC_TYPES": [
        "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE",
    ],
    "NATIVE_TYPES": [
        "BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE",
    ],
    "STRCAST_TYPES": [
        "BOOLEAN", "SMALLINT", "INT", "BIGINT", "FLOAT", "DOUBLE",
    ],
    "ALL_TYPES": [
        "BOOLEAN", "TINYINT", "SMALLINT", "INT", "BIGINT", "LARGEINT",
        "FLOAT", "DOUBLE", "VARCHAR", "DATETIME", "DECIMALV2",
    ],
    "MAX_TYPES": ["BIGINT", "LARGEINT", "DOUBLE", "DECIMALV2"],
}
# Declarations of the functions to generate. Each entry has the layout
#   [operation, [return type aliases], [[arg1 aliases], ..., [argN aliases]]]
functions = [
    # Binary predicates: two operands, each ranging over every type.
    ["Filter_Eq", ["BOOLEAN"], [["ALL_TYPES"], ["ALL_TYPES"]]],
    ["Filter_Ne", ["BOOLEAN"], [["ALL_TYPES"], ["ALL_TYPES"]]],
    ["Filter_Gt", ["BOOLEAN"], [["ALL_TYPES"], ["ALL_TYPES"]]],
    ["Filter_Lt", ["BOOLEAN"], [["ALL_TYPES"], ["ALL_TYPES"]]],
    ["Filter_Ge", ["BOOLEAN"], [["ALL_TYPES"], ["ALL_TYPES"]]],
    ["Filter_Le", ["BOOLEAN"], [["ALL_TYPES"], ["ALL_TYPES"]]],
    # IN predicate: a single declared operand.
    ["Filter_In", ["BOOLEAN"], [["ALL_TYPES"]]],
]
# Type name -> C++ type name that is appended to the generated function
# signature (see initialize_sub).
native_types = {
    "BOOLEAN": "bool",
    "TINYINT": "char",
    "SMALLINT": "short",
    "INT": "int",
    "BIGINT": "long",
    "LARGEINT": "__int128",
    "FLOAT": "float",
    "DOUBLE": "double",
    "VARCHAR": "StringValue",
    "DATE": "DateTimeValue",
    "DATETIME": "DateTimeValue",
    "DECIMALV2": "DecimalV2Value",
}
# Type name -> portable (fixed-width where applicable) C++ type that the
# generated function bodies actually operate on.
implemented_types = {
    "BOOLEAN": "bool",
    "TINYINT": "int8_t",
    "SMALLINT": "int16_t",
    "INT": "int32_t",
    "BIGINT": "int64_t",
    "LARGEINT": "__int128",
    "FLOAT": "float",
    "DOUBLE": "double",
    "VARCHAR": "StringValue",
    "DATE": "DateTimeValue",
    "DATETIME": "DateTimeValue",
    "DECIMALV2": "DecimalV2Value",
}
# Operation name -> C++ operator token substituted for ${native_op}.
native_ops = {
    # Vectorized filter predicates
    "Filter_Eq": "==",
    "Filter_Ne": "!=",
    "Filter_Gt": ">",
    "Filter_Lt": "<",
    "Filter_Ge": ">=",
    "Filter_Le": "<=",
    # Mixed-case comparison names
    "Eq": "==",
    "Ne": "!=",
    "Gt": ">",
    "Lt": "<",
    "Ge": ">=",
    "Le": "<=",
    # Upper-case operation names
    "BITAND": "&",
    "BITNOT": "~",
    "BITOR": "|",
    "BITXOR": "^",
    "DIVIDE": "/",
    "EQ": "==",
    "GT": ">",
    "GE": ">=",
    "INT_DIVIDE": "/",
    "SUBTRACT": "-",
    "MOD": "%",
    "MULTIPLY": "*",
    "LT": "<",
    "LE": "<=",
    "NE": "!=",
    "ADD": "+",
}
# Upper-case comparison name -> its mixed-case spelling ("EQ" -> "Eq", ...).
native_funcs = {
    code: code.capitalize()
    for code in ("EQ", "LE", "LT", "NE", "GE", "GT")
}
# Text written at the top of the generated vector-functions.cc: the
# "generated file" banner, the #include list, the using-directives and the
# opening of namespace doris.
cc_preamble = '\
// This is a generated file, DO NOT EDIT IT.\n\
// To add new functions, see impala/common/function-registry/gen_vector_functions.py\n\
\n\
#include "gen_cpp/opcode/vector-functions.h"\n\
#include "exprs/case_expr.h"\n\
#include "exprs/expr.h"\n\
#include "exprs/in_predicate.h"\n\
#include "runtime/string_value.hpp"\n\
#include "runtime/vectorized_row_batch.h"\n\
#include "util/string_parser.hpp"\n\
#include <boost/lexical_cast.hpp>\n\
\n\
using namespace boost;\n\
using namespace std;\n\
\n\
namespace doris { \n\
\n'
# Text written at the end of the generated .cc file: closes namespace doris.
cc_epilogue = '\
}\n'
# Text written at the top of the generated vector-functions.h: the banner,
# the include guard, forward declarations and the opening of the
# VectorComputeFunctions class (public section).
h_preamble = '\
// This is a generated file, DO NOT EDIT IT.\n\
// To add new functions, see impala/common/function-registry/gen_vector_functions.py\n\
\n\
#ifndef DORIS_OPCODE_VECTOR_FUNCTIONS_H\n\
#define DORIS_OPCODE_VECTOR_FUNCTIONS_H\n\
\n\
namespace doris {\n\
class Expr;\n\
class OpcodeRegistry;\n\
class VectorizedRowBatch;\n\
\n\
class VectorComputeFunctions {\n\
public:\n'
# Text written at the end of the generated header: closes the class, the
# namespace and the include guard.
h_epilogue = '\
};\n\
\n\
}\n\
\n\
#endif\n'
# Text written at the top of generated_vector_functions.py: shebang, license
# header, "generated file" banner and the opening of the `functions` list
# that the per-function rows are appended to.
python_preamble = '\
#!/usr/bin/env python\n\
# Licensed to the Apache Software Foundation (ASF) under one \n\
# or more contributor license agreements. See the NOTICE file \n\
# distributed with this work for additional information \n\
# regarding copyright ownership. The ASF licenses this file \n\
# to you under the Apache License, Version 2.0 (the \n\
# "License"); you may not use this file except in compliance \n\
# with the License. You may obtain a copy of the License at \n\
# \n\
# http://www.apache.org/licenses/LICENSE-2.0\n\
# \n\
# Unless required by applicable law or agreed to in writing, software\n\
# distributed under the License is distributed on an "AS IS" BASIS,\n\
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n\
# See the License for the specific language governing permissions and\n\
# limitations under the License.\n\
\n\
# This is a generated file, DO NOT EDIT IT.\n\
# To add new functions, see impala/common/function-registry/gen_opcodes.py\n\
\n\
functions = [\n'
# Closes the `functions` list opened by python_preamble.
python_epilogue = ']'
# Template for the static member declaration emitted into the generated
# header for each function signature; ${fn_signature} is the only placeholder.
header_template = string.Template("\
static bool ${fn_signature}(\n\
Expr* e, VectorizedRowBatch* batch);\n")
# Directory, relative to the current working directory, that receives the
# generated C++ header and implementation files.
BE_PATH = "../gen_cpp/opcode/"
# Make sure the output directory exists. Creating it first and tolerating
# "already exists" avoids the check-then-create race when several generator
# scripts run at the same time; any other failure is re-raised.
try:
    os.makedirs(BE_PATH)
except OSError:
    if not os.path.isdir(BE_PATH):
        raise
def initialize_sub(op, return_type, arg_types):
    """
    Build the substitution dictionary for one concrete function signature.

    op          -- operation name, e.g. 'Filter_Eq'
    return_type -- single return type name
    arg_types   -- list of single argument type names

    The result always contains 'fn_name', 'fn_signature' (the operation name
    followed by '_<native type>' for every argument), 'return_type' and
    'args' (each argument quoted and followed by ', '). It contains
    'native_op' only when the operation has an entry in native_ops, and one
    'native_type<N>' entry (1-based) per argument.
    """
    sub = {
        "fn_name": op,
        "fn_signature": op,
        "return_type": return_type,
        "args": "",
    }
    if op in native_ops:
        sub["native_op"] = native_ops[op]

    signature_parts = [op]
    quoted_args = []
    for position, arg in enumerate(arg_types, start=1):
        signature_parts.append(native_types[arg])
        sub["native_type" + str(position)] = implemented_types[arg]
        quoted_args.append("'" + arg + "', ")

    sub["fn_signature"] = "_".join(signature_parts)
    sub["args"] = "".join(quoted_args)
    return sub
if __name__ == "__main__":
    # Script entry point: expand every declaration in `functions` into its
    # concrete signatures and write, for each signature, a declaration to the
    # generated header, a definition to the generated .cc file and a metadata
    # row to the generated Python file.
    #
    # All three outputs are opened in one `with` statement so that they are
    # flushed and closed even if generation aborts on an invalid declaration.
    with open(BE_PATH + 'vector-functions.h', 'w') as h_file, \
            open(BE_PATH + 'vector-functions.cc', 'w') as cc_file, \
            open('generated_vector_functions.py', 'w') as python_file:
        h_file.write(h_preamble)
        cc_file.write(cc_preamble)
        python_file.write(python_preamble)

        # Generate functions and headers
        for func_data in functions:
            op = func_data[0]
            if len(func_data) >= 4:
                # If a specific template has been specified, use that one.
                template = func_data[3]
            elif op in templates:
                template = templates[op]
            else:
                # Skip functions with no template (shouldn't be auto-generated)
                continue

            # Expand every type alias into the concrete types it stands for.
            return_types = [t for ret in func_data[1] for t in types[ret]]
            signatures = [
                [t for arg in args for t in types[arg]]
                for args in func_data[2]
            ]

            # The number of functions to emit is the longest expanded list.
            num_functions = max(
                [len(return_types)] + [len(args) for args in signatures])

            # Validate the input: every expanded list must either hold a
            # single type (shared by all functions) or one type per function.
            for expanded in [return_types] + signatures:
                if len(expanded) != 1 and len(expanded) != num_functions:
                    # func_data is a list, so it must be converted explicitly;
                    # concatenating it to a str would raise TypeError.
                    print("Invalid Declaration: " + str(func_data))
                    sys.exit(1)

            # Iterate over every function signature to generate
            for i in range(num_functions):
                if len(return_types) == 1:
                    return_type = return_types[0]
                else:
                    return_type = return_types[i]
                arg_types = [
                    sig[0] if len(sig) == 1 else sig[i]
                    for sig in signatures
                ]

                # At this point, 'return_type' is a single type and
                # 'arg_types' is a list of single types
                sub = initialize_sub(op, return_type, arg_types)
                h_file.write(header_template.substitute(sub))
                cc_file.write(template.substitute(sub))
                python_file.write(python_template.substitute(sub))

        h_file.write(h_epilogue)
        cc_file.write(cc_epilogue)
        python_file.write(python_epilogue)