[refactor](jni) unified jni framework for java udf (#25302)

Refactor Java UDF to use the unified JNI framework.
The unified JNI framework uses VectorTable as the container for transferring data between C++ and Java, and hides the details of data format conversion.
In addition, the unified framework supports complex and nested types.
The performance of basic types remains consistent, with a 30% improvement in string types and an order of magnitude improvement in complex types.
This commit is contained in:
Ashin Gau
2023-10-18 09:27:54 +08:00
committed by GitHub
parent 26e332c608
commit 47689fd452
23 changed files with 2153 additions and 742 deletions

View File

@ -42,10 +42,11 @@
#include "vec/core/block.h"
#include "vec/data_types/data_type_array.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/exec/jni_connector.h"
// Slash-separated (JNI-style) name of the Java UdfExecutor class.
const char* EXECUTOR_CLASS = "org/apache/doris/udf/UdfExecutor";
// JNI signature used to look up the executor's "<init>" method: one byte[] argument, void return.
const char* EXECUTOR_CTOR_SIGNATURE = "([B)V";
// NOTE(review): EXECUTOR_EVALUATE_SIGNATURE is defined twice in this view; this first
// definition (no arguments, void return) looks like the pre-change line of a diff —
// confirm that only one definition remains in the real file.
const char* EXECUTOR_EVALUATE_SIGNATURE = "()V";
// JNI signature used to look up "evaluate": two java.util.Map arguments, long return.
const char* EXECUTOR_EVALUATE_SIGNATURE = "(Ljava/util/Map;Ljava/util/Map;)J";
// JNI signature used to look up "close": no arguments, void return.
const char* EXECUTOR_CLOSE_SIGNATURE = "()V";
namespace doris::vectorized {
@ -65,21 +66,8 @@ Status JavaFunctionCall::open(FunctionContext* context, FunctionContext::Functio
jni_env->executor_ctor_id =
env->GetMethodID(jni_env->executor_cl, "<init>", EXECUTOR_CTOR_SIGNATURE);
RETURN_ERROR_IF_EXC(env);
jni_env->executor_evaluate_id = env->GetMethodID(
jni_env->executor_cl, "evaluate", "(I[Ljava/lang/Object;)[Ljava/lang/Object;");
jni_env->executor_convert_basic_argument_id = env->GetMethodID(
jni_env->executor_cl, "convertBasicArguments", "(IZIJJJ)[Ljava/lang/Object;");
jni_env->executor_convert_array_argument_id = env->GetMethodID(
jni_env->executor_cl, "convertArrayArguments", "(IZIJJJJJ)[Ljava/lang/Object;");
jni_env->executor_convert_map_argument_id = env->GetMethodID(
jni_env->executor_cl, "convertMapArguments", "(IZIJJJJJJJJ)[Ljava/lang/Object;");
jni_env->executor_result_basic_batch_id = env->GetMethodID(
jni_env->executor_cl, "copyBatchBasicResult", "(ZI[Ljava/lang/Object;JJJ)V");
jni_env->executor_result_array_batch_id = env->GetMethodID(
jni_env->executor_cl, "copyBatchArrayResult", "(ZI[Ljava/lang/Object;JJJJJ)V");
jni_env->executor_result_map_batch_id = env->GetMethodID(
jni_env->executor_cl, "copyBatchMapResult", "(ZI[Ljava/lang/Object;JJJJJJJJ)V");
jni_env->executor_evaluate_id =
env->GetMethodID(jni_env->executor_cl, "evaluate", EXECUTOR_EVALUATE_SIGNATURE);
jni_env->executor_close_id =
env->GetMethodID(jni_env->executor_cl, "close", EXECUTOR_CLOSE_SIGNATURE);
RETURN_ERROR_IF_EXC(env);
@ -132,288 +120,29 @@ Status JavaFunctionCall::execute_impl(FunctionContext* context, Block& block,
context->get_function_state(FunctionContext::THREAD_LOCAL));
JniEnv* jni_env =
reinterpret_cast<JniEnv*>(context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
int arg_size = arguments.size();
ColumnPtr data_cols[arg_size];
ColumnPtr null_cols[arg_size];
jclass obj_class = env->FindClass("[Ljava/lang/Object;");
jclass arraylist_class = env->FindClass("Ljava/util/ArrayList;");
jclass hashmap_class = env->FindClass("Ljava/util/HashMap;");
jobjectArray arg_objects = env->NewObjectArray(arg_size, obj_class, nullptr);
int64_t nullmap_address = 0;
for (size_t arg_idx = 0; arg_idx < arg_size; ++arg_idx) {
bool arg_column_nullable = false;
// get argument column and type
ColumnWithTypeAndName& column = block.get_by_position(arguments[arg_idx]);
auto column_type = column.type;
data_cols[arg_idx] = column.column->convert_to_full_column_if_const();
// check type
DCHECK(_argument_types[arg_idx]->equals(*column_type))
<< " input column's type is " + column_type->get_name()
<< " does not equal to required type " << _argument_types[arg_idx]->get_name();
// get argument null map and nested column
if (auto* nullable = check_and_get_column<const ColumnNullable>(*data_cols[arg_idx])) {
arg_column_nullable = true;
column_type = remove_nullable(column_type);
null_cols[arg_idx] = nullable->get_null_map_column_ptr();
data_cols[arg_idx] = nullable->get_nested_column_ptr();
nullmap_address = reinterpret_cast<int64_t>(
check_and_get_column<ColumnVector<UInt8>>(null_cols[arg_idx])
->get_data()
.data());
}
// convert argument column data into java type
jobjectArray arr_obj = nullptr;
if (data_cols[arg_idx]->is_numeric() || data_cols[arg_idx]->is_column_decimal()) {
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
jni_ctx->executor, jni_env->executor_cl,
jni_env->executor_convert_basic_argument_id, arg_idx, arg_column_nullable,
num_rows, nullmap_address,
reinterpret_cast<int64_t>(data_cols[arg_idx]->get_raw_data().data), 0);
} else if (data_cols[arg_idx]->is_column_string()) {
const ColumnString* str_col =
assert_cast<const ColumnString*>(data_cols[arg_idx].get());
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
jni_ctx->executor, jni_env->executor_cl,
jni_env->executor_convert_basic_argument_id, arg_idx, arg_column_nullable,
num_rows, nullmap_address,
reinterpret_cast<int64_t>(str_col->get_chars().data()),
reinterpret_cast<int64_t>(str_col->get_offsets().data()));
} else if (data_cols[arg_idx]->is_column_array()) {
const ColumnArray* array_col =
assert_cast<const ColumnArray*>(data_cols[arg_idx].get());
const ColumnNullable& array_nested_nullable =
assert_cast<const ColumnNullable&>(array_col->get_data());
auto data_column_null_map = array_nested_nullable.get_null_map_column_ptr();
auto data_column = array_nested_nullable.get_nested_column_ptr();
auto offset_address =
reinterpret_cast<int64_t>(array_col->get_offsets_column().get_raw_data().data);
auto nested_nullmap_address = reinterpret_cast<int64_t>(
check_and_get_column<ColumnVector<UInt8>>(data_column_null_map)
->get_data()
.data());
int64_t nested_data_address = 0, nested_offset_address = 0;
// array type need pass address: [nullmap_address], offset_address, nested_nullmap_address, nested_data_address/nested_char_address,nested_offset_address
if (data_column->is_column_string()) {
const ColumnString* col = assert_cast<const ColumnString*>(data_column.get());
nested_data_address = reinterpret_cast<int64_t>(col->get_chars().data());
nested_offset_address = reinterpret_cast<int64_t>(col->get_offsets().data());
} else {
nested_data_address = reinterpret_cast<int64_t>(data_column->get_raw_data().data);
}
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
jni_ctx->executor, jni_env->executor_cl,
jni_env->executor_convert_array_argument_id, arg_idx, arg_column_nullable,
num_rows, nullmap_address, offset_address, nested_nullmap_address,
nested_data_address, nested_offset_address);
} else if (data_cols[arg_idx]->is_column_map()) {
const ColumnMap* map_col = assert_cast<const ColumnMap*>(data_cols[arg_idx].get());
auto offset_address =
reinterpret_cast<int64_t>(map_col->get_offsets_column().get_raw_data().data);
const ColumnNullable& map_key_column_nullable =
assert_cast<const ColumnNullable&>(map_col->get_keys());
auto key_data_column_null_map = map_key_column_nullable.get_null_map_column_ptr();
auto key_data_column = map_key_column_nullable.get_nested_column_ptr();
auto key_nested_nullmap_address = reinterpret_cast<int64_t>(
check_and_get_column<ColumnVector<UInt8>>(key_data_column_null_map)
->get_data()
.data());
int64_t key_nested_data_address = 0, key_nested_offset_address = 0;
if (key_data_column->is_column_string()) {
const ColumnString* col = assert_cast<const ColumnString*>(key_data_column.get());
key_nested_data_address = reinterpret_cast<int64_t>(col->get_chars().data());
key_nested_offset_address = reinterpret_cast<int64_t>(col->get_offsets().data());
} else {
key_nested_data_address =
reinterpret_cast<int64_t>(key_data_column->get_raw_data().data);
}
const ColumnNullable& map_value_column_nullable =
assert_cast<const ColumnNullable&>(map_col->get_values());
auto value_data_column_null_map = map_value_column_nullable.get_null_map_column_ptr();
auto value_data_column = map_value_column_nullable.get_nested_column_ptr();
auto value_nested_nullmap_address = reinterpret_cast<int64_t>(
check_and_get_column<ColumnVector<UInt8>>(value_data_column_null_map)
->get_data()
.data());
int64_t value_nested_data_address = 0, value_nested_offset_address = 0;
if (value_data_column->is_column_string()) {
const ColumnString* col = assert_cast<const ColumnString*>(value_data_column.get());
value_nested_data_address = reinterpret_cast<int64_t>(col->get_chars().data());
value_nested_offset_address = reinterpret_cast<int64_t>(col->get_offsets().data());
} else {
value_nested_data_address =
reinterpret_cast<int64_t>(value_data_column->get_raw_data().data);
}
arr_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
jni_ctx->executor, jni_env->executor_cl,
jni_env->executor_convert_map_argument_id, arg_idx, arg_column_nullable,
num_rows, nullmap_address, offset_address, key_nested_nullmap_address,
key_nested_data_address, key_nested_offset_address,
value_nested_nullmap_address, value_nested_data_address,
value_nested_offset_address);
} else {
return Status::InvalidArgument(
strings::Substitute("Java UDF doesn't support type $0 now !",
_argument_types[arg_idx]->get_name()));
}
env->SetObjectArrayElement(arg_objects, arg_idx, arr_obj);
env->DeleteLocalRef(arr_obj);
}
std::unique_ptr<long[]> input_table;
RETURN_IF_ERROR(JniConnector::to_java_table(&block, num_rows, arguments, input_table));
auto input_table_schema = JniConnector::parse_table_schema(&block, arguments, true);
std::map<String, String> input_params = {
{"meta_address", std::to_string((long)input_table.get())},
{"required_fields", input_table_schema.first},
{"columns_types", input_table_schema.second}};
jobject input_map = JniUtil::convert_to_java_map(env, input_params);
auto output_table_schema = JniConnector::parse_table_schema(&block, {result}, true);
std::string output_nullable =
block.get_by_position(result).type->is_nullable() ? "true" : "false";
std::map<String, String> output_params = {{"is_nullable", output_nullable},
{"required_fields", output_table_schema.first},
{"columns_types", output_table_schema.second}};
jobject output_map = JniUtil::convert_to_java_map(env, output_params);
long output_address = env->CallLongMethod(jni_ctx->executor, jni_env->executor_evaluate_id,
input_map, output_map);
env->DeleteLocalRef(input_map);
env->DeleteLocalRef(output_map);
RETURN_IF_ERROR(JniUtil::GetJniExceptionMsg(env));
// evaluate with argument object
jobjectArray result_obj = (jobjectArray)env->CallNonvirtualObjectMethod(
jni_ctx->executor, jni_env->executor_cl, jni_env->executor_evaluate_id, num_rows,
arg_objects);
env->DeleteLocalRef(arg_objects);
RETURN_IF_ERROR(JniUtil::GetJniExceptionMsg(env));
auto return_type = block.get_data_type(result);
bool result_nullable = return_type->is_nullable();
ColumnUInt8::MutablePtr null_col = nullptr;
if (result_nullable) {
return_type = remove_nullable(return_type);
null_col = ColumnUInt8::create(num_rows, 0);
memset(null_col->get_data().data(), 0, num_rows);
nullmap_address = reinterpret_cast<int64_t>(null_col->get_data().data());
}
auto res_col = return_type->create_column();
res_col->resize(num_rows);
//could resize for column firstly, copy batch result into column
if (res_col->is_numeric() || res_col->is_column_decimal()) {
env->CallNonvirtualVoidMethod(jni_ctx->executor, jni_env->executor_cl,
jni_env->executor_result_basic_batch_id, result_nullable,
num_rows, result_obj, nullmap_address,
reinterpret_cast<int64_t>(res_col->get_raw_data().data), 0);
} else if (res_col->is_column_string()) {
const ColumnString* str_col = assert_cast<const ColumnString*>(res_col.get());
ColumnString::Chars& chars = const_cast<ColumnString::Chars&>(str_col->get_chars());
ColumnString::Offsets& offsets = const_cast<ColumnString::Offsets&>(str_col->get_offsets());
env->CallNonvirtualVoidMethod(
jni_ctx->executor, jni_env->executor_cl, jni_env->executor_result_basic_batch_id,
result_nullable, num_rows, result_obj, nullmap_address,
reinterpret_cast<int64_t>(&chars), reinterpret_cast<int64_t>(offsets.data()));
} else if (res_col->is_column_array()) {
ColumnArray* array_col = assert_cast<ColumnArray*>(res_col.get());
ColumnNullable& array_nested_nullable = assert_cast<ColumnNullable&>(array_col->get_data());
auto data_column_null_map = array_nested_nullable.get_null_map_column_ptr();
auto data_column = array_nested_nullable.get_nested_column_ptr();
auto& offset_column = array_col->get_offsets_column();
auto offset_address = reinterpret_cast<int64_t>(offset_column.get_raw_data().data);
auto& null_map_data =
assert_cast<ColumnVector<UInt8>*>(data_column_null_map.get())->get_data();
auto nested_nullmap_address = reinterpret_cast<int64_t>(null_map_data.data());
jmethodID list_size = env->GetMethodID(arraylist_class, "size", "()I");
int element_size = 0; // get all element size in num_rows of array column
for (int i = 0; i < num_rows; ++i) {
jobject obj = env->GetObjectArrayElement(result_obj, i);
if (obj == nullptr) {
continue;
}
element_size = element_size + env->CallIntMethod(obj, list_size);
env->DeleteLocalRef(obj);
}
array_nested_nullable.resize(element_size);
memset(null_map_data.data(), 0, element_size);
int64_t nested_data_address = 0, nested_offset_address = 0;
// array type need pass address: [nullmap_address], offset_address, nested_nullmap_address, nested_data_address/nested_char_address,nested_offset_address
if (data_column->is_column_string()) {
ColumnString* str_col = assert_cast<ColumnString*>(data_column.get());
ColumnString::Chars& chars = assert_cast<ColumnString::Chars&>(str_col->get_chars());
ColumnString::Offsets& offsets =
assert_cast<ColumnString::Offsets&>(str_col->get_offsets());
nested_data_address = reinterpret_cast<int64_t>(&chars);
nested_offset_address = reinterpret_cast<int64_t>(offsets.data());
} else {
nested_data_address = reinterpret_cast<int64_t>(data_column->get_raw_data().data);
}
env->CallNonvirtualVoidMethod(
jni_ctx->executor, jni_env->executor_cl, jni_env->executor_result_array_batch_id,
result_nullable, num_rows, result_obj, nullmap_address, offset_address,
nested_nullmap_address, nested_data_address, nested_offset_address);
} else if (res_col->is_column_map()) {
ColumnMap* map_col = assert_cast<ColumnMap*>(res_col.get());
auto& offset_column = map_col->get_offsets_column();
auto offset_address = reinterpret_cast<int64_t>(offset_column.get_raw_data().data);
ColumnNullable& map_key_column_nullable = assert_cast<ColumnNullable&>(map_col->get_keys());
auto key_data_column_null_map = map_key_column_nullable.get_null_map_column_ptr();
auto key_data_column = map_key_column_nullable.get_nested_column_ptr();
auto& key_null_map_data =
assert_cast<ColumnVector<UInt8>*>(key_data_column_null_map.get())->get_data();
auto key_nested_nullmap_address = reinterpret_cast<int64_t>(key_null_map_data.data());
ColumnNullable& map_value_column_nullable =
assert_cast<ColumnNullable&>(map_col->get_values());
auto value_data_column_null_map = map_value_column_nullable.get_null_map_column_ptr();
auto value_data_column = map_value_column_nullable.get_nested_column_ptr();
auto& value_null_map_data =
assert_cast<ColumnVector<UInt8>*>(value_data_column_null_map.get())->get_data();
auto value_nested_nullmap_address = reinterpret_cast<int64_t>(value_null_map_data.data());
jmethodID map_size = env->GetMethodID(hashmap_class, "size", "()I");
int element_size = 0; // get all element size in num_rows of map column
for (int i = 0; i < num_rows; ++i) {
jobject obj = env->GetObjectArrayElement(result_obj, i);
if (obj == nullptr) {
continue;
}
element_size = element_size + env->CallIntMethod(obj, map_size);
env->DeleteLocalRef(obj);
}
map_key_column_nullable.resize(element_size);
memset(key_null_map_data.data(), 0, element_size);
map_value_column_nullable.resize(element_size);
memset(value_null_map_data.data(), 0, element_size);
int64_t key_nested_data_address = 0, key_nested_offset_address = 0;
if (key_data_column->is_column_string()) {
ColumnString* str_col = assert_cast<ColumnString*>(key_data_column.get());
ColumnString::Chars& chars = assert_cast<ColumnString::Chars&>(str_col->get_chars());
ColumnString::Offsets& offsets =
assert_cast<ColumnString::Offsets&>(str_col->get_offsets());
key_nested_data_address = reinterpret_cast<int64_t>(&chars);
key_nested_offset_address = reinterpret_cast<int64_t>(offsets.data());
} else {
key_nested_data_address =
reinterpret_cast<int64_t>(key_data_column->get_raw_data().data);
}
int64_t value_nested_data_address = 0, value_nested_offset_address = 0;
if (value_data_column->is_column_string()) {
ColumnString* str_col = assert_cast<ColumnString*>(value_data_column.get());
ColumnString::Chars& chars = assert_cast<ColumnString::Chars&>(str_col->get_chars());
ColumnString::Offsets& offsets =
assert_cast<ColumnString::Offsets&>(str_col->get_offsets());
value_nested_data_address = reinterpret_cast<int64_t>(&chars);
value_nested_offset_address = reinterpret_cast<int64_t>(offsets.data());
} else {
value_nested_data_address =
reinterpret_cast<int64_t>(value_data_column->get_raw_data().data);
}
env->CallNonvirtualVoidMethod(jni_ctx->executor, jni_env->executor_cl,
jni_env->executor_result_map_batch_id, result_nullable,
num_rows, result_obj, nullmap_address, offset_address,
key_nested_nullmap_address, key_nested_data_address,
key_nested_offset_address, value_nested_nullmap_address,
value_nested_data_address, value_nested_offset_address);
} else {
return Status::InvalidArgument(strings::Substitute(
"Java UDF doesn't support return type $0 now !", return_type->get_name()));
}
env->DeleteLocalRef(result_obj);
env->DeleteLocalRef(obj_class);
env->DeleteLocalRef(arraylist_class);
env->DeleteLocalRef(hashmap_class);
if (result_nullable) {
block.replace_by_position(result,
ColumnNullable::create(std::move(res_col), std::move(null_col)));
} else {
block.replace_by_position(result, std::move(res_col));
}
return JniUtil::GetJniExceptionMsg(env);
return JniConnector::fill_block(&block, {result}, output_address);
}
Status JavaFunctionCall::close(FunctionContext* context,