[refactor](jni) unified jni framework for java udf (#25302)
Use the unified JNI framework to refactor Java UDF. The unified JNI framework uses VectorTable as the container to transfer data between C++ and Java, and hides the details of data format conversion. In addition, the unified framework supports complex and nested types. Performance for basic types remains consistent, while string types improve by 30% and complex types improve by an order of magnitude.
This commit is contained in:
@ -42,10 +42,11 @@
|
||||
#include "vec/core/block.h"
|
||||
#include "vec/data_types/data_type_array.h"
|
||||
#include "vec/data_types/data_type_nullable.h"
|
||||
#include "vec/exec/jni_connector.h"
|
||||
|
||||
const char* EXECUTOR_CLASS = "org/apache/doris/udf/UdfExecutor";
|
||||
const char* EXECUTOR_CTOR_SIGNATURE = "([B)V";
|
||||
const char* EXECUTOR_EVALUATE_SIGNATURE = "()V";
|
||||
const char* EXECUTOR_EVALUATE_SIGNATURE = "(Ljava/util/Map;Ljava/util/Map;)J";
|
||||
const char* EXECUTOR_CLOSE_SIGNATURE = "()V";
|
||||
|
||||
namespace doris::vectorized {
|
||||
@ -65,21 +66,8 @@ Status JavaFunctionCall::open(FunctionContext* context, FunctionContext::Functio
|
||||
jni_env->executor_ctor_id =
|
||||
env->GetMethodID(jni_env->executor_cl, "<init>", EXECUTOR_CTOR_SIGNATURE);
|
||||
RETURN_ERROR_IF_EXC(env);
|
||||
jni_env->executor_evaluate_id = env->GetMethodID(
|
||||
jni_env->executor_cl, "evaluate", "(I[Ljava/lang/Object;)[Ljava/lang/Object;");
|
||||
|
||||
jni_env->executor_convert_basic_argument_id = env->GetMethodID(
|
||||
jni_env->executor_cl, "convertBasicArguments", "(IZIJJJ)[Ljava/lang/Object;");
|
||||
jni_env->executor_convert_array_argument_id = env->GetMethodID(
|
||||
jni_env->executor_cl, "convertArrayArguments", "(IZIJJJJJ)[Ljava/lang/Object;");
|
||||
jni_env->executor_convert_map_argument_id = env->GetMethodID(
|
||||
jni_env->executor_cl, "convertMapArguments", "(IZIJJJJJJJJ)[Ljava/lang/Object;");
|
||||
jni_env->executor_result_basic_batch_id = env->GetMethodID(
|
||||
jni_env->executor_cl, "copyBatchBasicResult", "(ZI[Ljava/lang/Object;JJJ)V");
|
||||
jni_env->executor_result_array_batch_id = env->GetMethodID(
|
||||
jni_env->executor_cl, "copyBatchArrayResult", "(ZI[Ljava/lang/Object;JJJJJ)V");
|
||||
jni_env->executor_result_map_batch_id = env->GetMethodID(
|
||||
jni_env->executor_cl, "copyBatchMapResult", "(ZI[Ljava/lang/Object;JJJJJJJJ)V");
|
||||
jni_env->executor_evaluate_id =
|
||||
env->GetMethodID(jni_env->executor_cl, "evaluate", EXECUTOR_EVALUATE_SIGNATURE);
|
||||
jni_env->executor_close_id =
|
||||
env->GetMethodID(jni_env->executor_cl, "close", EXECUTOR_CLOSE_SIGNATURE);
|
||||
RETURN_ERROR_IF_EXC(env);
|
||||
@ -132,288 +120,29 @@ Status JavaFunctionCall::execute_impl(FunctionContext* context, Block& block,
|
||||
context->get_function_state(FunctionContext::THREAD_LOCAL));
|
||||
JniEnv* jni_env =
|
||||
reinterpret_cast<JniEnv*>(context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
|
||||
int arg_size = arguments.size();
|
||||
ColumnPtr data_cols[arg_size];
|
||||
ColumnPtr null_cols[arg_size];
|
||||
jclass obj_class = env->FindClass("[Ljava/lang/Object;");
|
||||
jclass arraylist_class = env->FindClass("Ljava/util/ArrayList;");
|
||||
jclass hashmap_class = env->FindClass("Ljava/util/HashMap;");
|
||||
jobjectArray arg_objects = env->NewObjectArray(arg_size, obj_class, nullptr);
|
||||
int64_t nullmap_address = 0;
|
||||
for (size_t arg_idx = 0; arg_idx < arg_size; ++arg_idx) {
|
||||
bool arg_column_nullable = false;
|
||||
// get argument column and type
|
||||
ColumnWithTypeAndName& column = block.get_by_position(arguments[arg_idx]);
|
||||
auto column_type = column.type;
|
||||
data_cols[arg_idx] = column.column->convert_to_full_column_if_const();
|
||||
|
||||
// check type
|
||||
DCHECK(_argument_types[arg_idx]->equals(*column_type))
|
||||
<< " input column's type is " + column_type->get_name()
|
||||
<< " does not equal to required type " << _argument_types[arg_idx]->get_name();
|
||||
|
||||
// get argument null map and nested column
|
||||
if (auto* nullable = check_and_get_column<const ColumnNullable>(*data_cols[arg_idx])) {
|
||||
arg_column_nullable = true;
|
||||
column_type = remove_nullable(column_type);
|
||||
null_cols[arg_idx] = nullable->get_null_map_column_ptr();
|
||||
data_cols[arg_idx] = nullable->get_nested_column_ptr();
|
||||
nullmap_address = reinterpret_cast<int64_t>(
|
||||
check_and_get_column<ColumnVector<UInt8>>(null_cols[arg_idx])
|
||||
->get_data()
|
||||
.data());
|
||||
}
|
||||
|
||||
// convert argument column data into java type
|
||||
jobjectArray arr_obj = nullptr;
|
||||
if (data_cols[arg_idx]->is_numeric() || data_cols[arg_idx]->is_column_decimal()) {
|
||||
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
|
||||
jni_ctx->executor, jni_env->executor_cl,
|
||||
jni_env->executor_convert_basic_argument_id, arg_idx, arg_column_nullable,
|
||||
num_rows, nullmap_address,
|
||||
reinterpret_cast<int64_t>(data_cols[arg_idx]->get_raw_data().data), 0);
|
||||
} else if (data_cols[arg_idx]->is_column_string()) {
|
||||
const ColumnString* str_col =
|
||||
assert_cast<const ColumnString*>(data_cols[arg_idx].get());
|
||||
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
|
||||
jni_ctx->executor, jni_env->executor_cl,
|
||||
jni_env->executor_convert_basic_argument_id, arg_idx, arg_column_nullable,
|
||||
num_rows, nullmap_address,
|
||||
reinterpret_cast<int64_t>(str_col->get_chars().data()),
|
||||
reinterpret_cast<int64_t>(str_col->get_offsets().data()));
|
||||
} else if (data_cols[arg_idx]->is_column_array()) {
|
||||
const ColumnArray* array_col =
|
||||
assert_cast<const ColumnArray*>(data_cols[arg_idx].get());
|
||||
const ColumnNullable& array_nested_nullable =
|
||||
assert_cast<const ColumnNullable&>(array_col->get_data());
|
||||
auto data_column_null_map = array_nested_nullable.get_null_map_column_ptr();
|
||||
auto data_column = array_nested_nullable.get_nested_column_ptr();
|
||||
auto offset_address =
|
||||
reinterpret_cast<int64_t>(array_col->get_offsets_column().get_raw_data().data);
|
||||
auto nested_nullmap_address = reinterpret_cast<int64_t>(
|
||||
check_and_get_column<ColumnVector<UInt8>>(data_column_null_map)
|
||||
->get_data()
|
||||
.data());
|
||||
int64_t nested_data_address = 0, nested_offset_address = 0;
|
||||
// array type need pass address: [nullmap_address], offset_address, nested_nullmap_address, nested_data_address/nested_char_address,nested_offset_address
|
||||
if (data_column->is_column_string()) {
|
||||
const ColumnString* col = assert_cast<const ColumnString*>(data_column.get());
|
||||
nested_data_address = reinterpret_cast<int64_t>(col->get_chars().data());
|
||||
nested_offset_address = reinterpret_cast<int64_t>(col->get_offsets().data());
|
||||
} else {
|
||||
nested_data_address = reinterpret_cast<int64_t>(data_column->get_raw_data().data);
|
||||
}
|
||||
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
|
||||
jni_ctx->executor, jni_env->executor_cl,
|
||||
jni_env->executor_convert_array_argument_id, arg_idx, arg_column_nullable,
|
||||
num_rows, nullmap_address, offset_address, nested_nullmap_address,
|
||||
nested_data_address, nested_offset_address);
|
||||
} else if (data_cols[arg_idx]->is_column_map()) {
|
||||
const ColumnMap* map_col = assert_cast<const ColumnMap*>(data_cols[arg_idx].get());
|
||||
auto offset_address =
|
||||
reinterpret_cast<int64_t>(map_col->get_offsets_column().get_raw_data().data);
|
||||
const ColumnNullable& map_key_column_nullable =
|
||||
assert_cast<const ColumnNullable&>(map_col->get_keys());
|
||||
auto key_data_column_null_map = map_key_column_nullable.get_null_map_column_ptr();
|
||||
auto key_data_column = map_key_column_nullable.get_nested_column_ptr();
|
||||
|
||||
auto key_nested_nullmap_address = reinterpret_cast<int64_t>(
|
||||
check_and_get_column<ColumnVector<UInt8>>(key_data_column_null_map)
|
||||
->get_data()
|
||||
.data());
|
||||
int64_t key_nested_data_address = 0, key_nested_offset_address = 0;
|
||||
if (key_data_column->is_column_string()) {
|
||||
const ColumnString* col = assert_cast<const ColumnString*>(key_data_column.get());
|
||||
key_nested_data_address = reinterpret_cast<int64_t>(col->get_chars().data());
|
||||
key_nested_offset_address = reinterpret_cast<int64_t>(col->get_offsets().data());
|
||||
} else {
|
||||
key_nested_data_address =
|
||||
reinterpret_cast<int64_t>(key_data_column->get_raw_data().data);
|
||||
}
|
||||
|
||||
const ColumnNullable& map_value_column_nullable =
|
||||
assert_cast<const ColumnNullable&>(map_col->get_values());
|
||||
auto value_data_column_null_map = map_value_column_nullable.get_null_map_column_ptr();
|
||||
auto value_data_column = map_value_column_nullable.get_nested_column_ptr();
|
||||
auto value_nested_nullmap_address = reinterpret_cast<int64_t>(
|
||||
check_and_get_column<ColumnVector<UInt8>>(value_data_column_null_map)
|
||||
->get_data()
|
||||
.data());
|
||||
int64_t value_nested_data_address = 0, value_nested_offset_address = 0;
|
||||
if (value_data_column->is_column_string()) {
|
||||
const ColumnString* col = assert_cast<const ColumnString*>(value_data_column.get());
|
||||
value_nested_data_address = reinterpret_cast<int64_t>(col->get_chars().data());
|
||||
value_nested_offset_address = reinterpret_cast<int64_t>(col->get_offsets().data());
|
||||
} else {
|
||||
value_nested_data_address =
|
||||
reinterpret_cast<int64_t>(value_data_column->get_raw_data().data);
|
||||
}
|
||||
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
|
||||
jni_ctx->executor, jni_env->executor_cl,
|
||||
jni_env->executor_convert_map_argument_id, arg_idx, arg_column_nullable,
|
||||
num_rows, nullmap_address, offset_address, key_nested_nullmap_address,
|
||||
key_nested_data_address, key_nested_offset_address,
|
||||
value_nested_nullmap_address, value_nested_data_address,
|
||||
value_nested_offset_address);
|
||||
} else {
|
||||
return Status::InvalidArgument(
|
||||
strings::Substitute("Java UDF doesn't support type $0 now !",
|
||||
_argument_types[arg_idx]->get_name()));
|
||||
}
|
||||
|
||||
env->SetObjectArrayElement(arg_objects, arg_idx, arr_obj);
|
||||
env->DeleteLocalRef(arr_obj);
|
||||
}
|
||||
std::unique_ptr<long[]> input_table;
|
||||
RETURN_IF_ERROR(JniConnector::to_java_table(&block, num_rows, arguments, input_table));
|
||||
auto input_table_schema = JniConnector::parse_table_schema(&block, arguments, true);
|
||||
std::map<String, String> input_params = {
|
||||
{"meta_address", std::to_string((long)input_table.get())},
|
||||
{"required_fields", input_table_schema.first},
|
||||
{"columns_types", input_table_schema.second}};
|
||||
jobject input_map = JniUtil::convert_to_java_map(env, input_params);
|
||||
auto output_table_schema = JniConnector::parse_table_schema(&block, {result}, true);
|
||||
std::string output_nullable =
|
||||
block.get_by_position(result).type->is_nullable() ? "true" : "false";
|
||||
std::map<String, String> output_params = {{"is_nullable", output_nullable},
|
||||
{"required_fields", output_table_schema.first},
|
||||
{"columns_types", output_table_schema.second}};
|
||||
jobject output_map = JniUtil::convert_to_java_map(env, output_params);
|
||||
long output_address = env->CallLongMethod(jni_ctx->executor, jni_env->executor_evaluate_id,
|
||||
input_map, output_map);
|
||||
env->DeleteLocalRef(input_map);
|
||||
env->DeleteLocalRef(output_map);
|
||||
RETURN_IF_ERROR(JniUtil::GetJniExceptionMsg(env));
|
||||
|
||||
// evaluate with argument object
|
||||
jobjectArray result_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
|
||||
jni_ctx->executor, jni_env->executor_cl, jni_env->executor_evaluate_id, num_rows,
|
||||
arg_objects);
|
||||
env->DeleteLocalRef(arg_objects);
|
||||
RETURN_IF_ERROR(JniUtil::GetJniExceptionMsg(env));
|
||||
|
||||
auto return_type = block.get_data_type(result);
|
||||
bool result_nullable = return_type->is_nullable();
|
||||
ColumnUInt8::MutablePtr null_col = nullptr;
|
||||
if (result_nullable) {
|
||||
return_type = remove_nullable(return_type);
|
||||
null_col = ColumnUInt8::create(num_rows, 0);
|
||||
memset(null_col->get_data().data(), 0, num_rows);
|
||||
nullmap_address = reinterpret_cast<int64_t>(null_col->get_data().data());
|
||||
}
|
||||
auto res_col = return_type->create_column();
|
||||
res_col->resize(num_rows);
|
||||
|
||||
//could resize for column firstly, copy batch result into column
|
||||
if (res_col->is_numeric() || res_col->is_column_decimal()) {
|
||||
env->CallNonvirtualVoidMethod(jni_ctx->executor, jni_env->executor_cl,
|
||||
jni_env->executor_result_basic_batch_id, result_nullable,
|
||||
num_rows, result_obj, nullmap_address,
|
||||
reinterpret_cast<int64_t>(res_col->get_raw_data().data), 0);
|
||||
} else if (res_col->is_column_string()) {
|
||||
const ColumnString* str_col = assert_cast<const ColumnString*>(res_col.get());
|
||||
ColumnString::Chars& chars = const_cast<ColumnString::Chars&>(str_col->get_chars());
|
||||
ColumnString::Offsets& offsets = const_cast<ColumnString::Offsets&>(str_col->get_offsets());
|
||||
|
||||
env->CallNonvirtualVoidMethod(
|
||||
jni_ctx->executor, jni_env->executor_cl, jni_env->executor_result_basic_batch_id,
|
||||
result_nullable, num_rows, result_obj, nullmap_address,
|
||||
reinterpret_cast<int64_t>(&chars), reinterpret_cast<int64_t>(offsets.data()));
|
||||
} else if (res_col->is_column_array()) {
|
||||
ColumnArray* array_col = assert_cast<ColumnArray*>(res_col.get());
|
||||
ColumnNullable& array_nested_nullable = assert_cast<ColumnNullable&>(array_col->get_data());
|
||||
auto data_column_null_map = array_nested_nullable.get_null_map_column_ptr();
|
||||
auto data_column = array_nested_nullable.get_nested_column_ptr();
|
||||
auto& offset_column = array_col->get_offsets_column();
|
||||
auto offset_address = reinterpret_cast<int64_t>(offset_column.get_raw_data().data);
|
||||
auto& null_map_data =
|
||||
assert_cast<ColumnVector<UInt8>*>(data_column_null_map.get())->get_data();
|
||||
auto nested_nullmap_address = reinterpret_cast<int64_t>(null_map_data.data());
|
||||
jmethodID list_size = env->GetMethodID(arraylist_class, "size", "()I");
|
||||
int element_size = 0; // get all element size in num_rows of array column
|
||||
for (int i = 0; i < num_rows; ++i) {
|
||||
jobject obj = env->GetObjectArrayElement(result_obj, i);
|
||||
if (obj == nullptr) {
|
||||
continue;
|
||||
}
|
||||
element_size = element_size + env->CallIntMethod(obj, list_size);
|
||||
env->DeleteLocalRef(obj);
|
||||
}
|
||||
array_nested_nullable.resize(element_size);
|
||||
memset(null_map_data.data(), 0, element_size);
|
||||
int64_t nested_data_address = 0, nested_offset_address = 0;
|
||||
// array type need pass address: [nullmap_address], offset_address, nested_nullmap_address, nested_data_address/nested_char_address,nested_offset_address
|
||||
if (data_column->is_column_string()) {
|
||||
ColumnString* str_col = assert_cast<ColumnString*>(data_column.get());
|
||||
ColumnString::Chars& chars = assert_cast<ColumnString::Chars&>(str_col->get_chars());
|
||||
ColumnString::Offsets& offsets =
|
||||
assert_cast<ColumnString::Offsets&>(str_col->get_offsets());
|
||||
nested_data_address = reinterpret_cast<int64_t>(&chars);
|
||||
nested_offset_address = reinterpret_cast<int64_t>(offsets.data());
|
||||
} else {
|
||||
nested_data_address = reinterpret_cast<int64_t>(data_column->get_raw_data().data);
|
||||
}
|
||||
env->CallNonvirtualVoidMethod(
|
||||
jni_ctx->executor, jni_env->executor_cl, jni_env->executor_result_array_batch_id,
|
||||
result_nullable, num_rows, result_obj, nullmap_address, offset_address,
|
||||
nested_nullmap_address, nested_data_address, nested_offset_address);
|
||||
} else if (res_col->is_column_map()) {
|
||||
ColumnMap* map_col = assert_cast<ColumnMap*>(res_col.get());
|
||||
auto& offset_column = map_col->get_offsets_column();
|
||||
auto offset_address = reinterpret_cast<int64_t>(offset_column.get_raw_data().data);
|
||||
ColumnNullable& map_key_column_nullable = assert_cast<ColumnNullable&>(map_col->get_keys());
|
||||
auto key_data_column_null_map = map_key_column_nullable.get_null_map_column_ptr();
|
||||
auto key_data_column = map_key_column_nullable.get_nested_column_ptr();
|
||||
auto& key_null_map_data =
|
||||
assert_cast<ColumnVector<UInt8>*>(key_data_column_null_map.get())->get_data();
|
||||
auto key_nested_nullmap_address = reinterpret_cast<int64_t>(key_null_map_data.data());
|
||||
ColumnNullable& map_value_column_nullable =
|
||||
assert_cast<ColumnNullable&>(map_col->get_values());
|
||||
auto value_data_column_null_map = map_value_column_nullable.get_null_map_column_ptr();
|
||||
auto value_data_column = map_value_column_nullable.get_nested_column_ptr();
|
||||
auto& value_null_map_data =
|
||||
assert_cast<ColumnVector<UInt8>*>(value_data_column_null_map.get())->get_data();
|
||||
auto value_nested_nullmap_address = reinterpret_cast<int64_t>(value_null_map_data.data());
|
||||
jmethodID map_size = env->GetMethodID(hashmap_class, "size", "()I");
|
||||
int element_size = 0; // get all element size in num_rows of map column
|
||||
for (int i = 0; i < num_rows; ++i) {
|
||||
jobject obj = env->GetObjectArrayElement(result_obj, i);
|
||||
if (obj == nullptr) {
|
||||
continue;
|
||||
}
|
||||
element_size = element_size + env->CallIntMethod(obj, map_size);
|
||||
env->DeleteLocalRef(obj);
|
||||
}
|
||||
map_key_column_nullable.resize(element_size);
|
||||
memset(key_null_map_data.data(), 0, element_size);
|
||||
map_value_column_nullable.resize(element_size);
|
||||
memset(value_null_map_data.data(), 0, element_size);
|
||||
int64_t key_nested_data_address = 0, key_nested_offset_address = 0;
|
||||
if (key_data_column->is_column_string()) {
|
||||
ColumnString* str_col = assert_cast<ColumnString*>(key_data_column.get());
|
||||
ColumnString::Chars& chars = assert_cast<ColumnString::Chars&>(str_col->get_chars());
|
||||
ColumnString::Offsets& offsets =
|
||||
assert_cast<ColumnString::Offsets&>(str_col->get_offsets());
|
||||
key_nested_data_address = reinterpret_cast<int64_t>(&chars);
|
||||
key_nested_offset_address = reinterpret_cast<int64_t>(offsets.data());
|
||||
} else {
|
||||
key_nested_data_address =
|
||||
reinterpret_cast<int64_t>(key_data_column->get_raw_data().data);
|
||||
}
|
||||
int64_t value_nested_data_address = 0, value_nested_offset_address = 0;
|
||||
if (value_data_column->is_column_string()) {
|
||||
ColumnString* str_col = assert_cast<ColumnString*>(value_data_column.get());
|
||||
ColumnString::Chars& chars = assert_cast<ColumnString::Chars&>(str_col->get_chars());
|
||||
ColumnString::Offsets& offsets =
|
||||
assert_cast<ColumnString::Offsets&>(str_col->get_offsets());
|
||||
value_nested_data_address = reinterpret_cast<int64_t>(&chars);
|
||||
value_nested_offset_address = reinterpret_cast<int64_t>(offsets.data());
|
||||
} else {
|
||||
value_nested_data_address =
|
||||
reinterpret_cast<int64_t>(value_data_column->get_raw_data().data);
|
||||
}
|
||||
env->CallNonvirtualVoidMethod(jni_ctx->executor, jni_env->executor_cl,
|
||||
jni_env->executor_result_map_batch_id, result_nullable,
|
||||
num_rows, result_obj, nullmap_address, offset_address,
|
||||
key_nested_nullmap_address, key_nested_data_address,
|
||||
key_nested_offset_address, value_nested_nullmap_address,
|
||||
value_nested_data_address, value_nested_offset_address);
|
||||
} else {
|
||||
return Status::InvalidArgument(strings::Substitute(
|
||||
"Java UDF doesn't support return type $0 now !", return_type->get_name()));
|
||||
}
|
||||
env->DeleteLocalRef(result_obj);
|
||||
env->DeleteLocalRef(obj_class);
|
||||
env->DeleteLocalRef(arraylist_class);
|
||||
env->DeleteLocalRef(hashmap_class);
|
||||
if (result_nullable) {
|
||||
block.replace_by_position(result,
|
||||
ColumnNullable::create(std::move(res_col), std::move(null_col)));
|
||||
} else {
|
||||
block.replace_by_position(result, std::move(res_col));
|
||||
}
|
||||
return JniUtil::GetJniExceptionMsg(env);
|
||||
return JniConnector::fill_block(&block, {result}, output_address);
|
||||
}
|
||||
|
||||
Status JavaFunctionCall::close(FunctionContext* context,
|
||||
|
||||
Reference in New Issue
Block a user