[feature-wip](java-udf) support java UDF with fixed-length input and output (#8516)

This feature is proposed in [DSIP-1](https://cwiki.apache.org/confluence/display/DORIS/DSIP-001%3A+Java+UDF).
This PR supports Java UDFs with fixed-length input and output. Phase I of DSIP-1 is done after this PR.

To support Java UDFs efficiently, no data is copied in the JNI call and all compute operations are off-heap in Java.
To achieve that, I use a UdfExecutor instead. 

For users, a UDF class must have a public evaluate method.
This commit is contained in:
Gabriel
2022-03-23 10:32:50 +08:00
committed by GitHub
parent 9f0b93e3c6
commit b89e4c7bba
27 changed files with 2832 additions and 22 deletions

View File

@ -28,6 +28,7 @@
#include "http/http_client.h"
#include "util/dynamic_util.h"
#include "util/file_utils.h"
#include "util/jni-util.h"
#include "util/md5.h"
#include "util/spinlock.h"
@ -37,8 +38,9 @@ static const int kLibShardNum = 128;
// function cache entry, store information for
struct UserFunctionCacheEntry {
UserFunctionCacheEntry(int64_t fid_, const std::string& checksum_, const std::string& lib_file_)
: function_id(fid_), checksum(checksum_), lib_file(lib_file_) {}
UserFunctionCacheEntry(int64_t fid_, const std::string& checksum_, const std::string& lib_file_,
LibType type)
: function_id(fid_), checksum(checksum_), lib_file(lib_file_), type(type) {}
~UserFunctionCacheEntry();
void ref() { _refs.fetch_add(1); }
@ -75,6 +77,8 @@ struct UserFunctionCacheEntry {
// from symbol_name to function pointer
std::unordered_map<std::string, void*> fptr_map;
LibType type;
private:
std::atomic<int> _refs{0};
};
@ -141,7 +145,7 @@ Status UserFunctionCache::_load_entry_from_lib(const std::string& dir, const std
}
// create a cache entry and put it into entry map
UserFunctionCacheEntry* entry =
new UserFunctionCacheEntry(function_id, checksum, dir + "/" + file);
new UserFunctionCacheEntry(function_id, checksum, dir + "/" + file, LibType::SO);
entry->is_downloaded = true;
entry->ref();
@ -199,7 +203,7 @@ Status UserFunctionCache::get_function_ptr(int64_t fid, const std::string& orig_
if (output_entry != nullptr && *output_entry != nullptr) {
entry = *output_entry;
} else {
RETURN_IF_ERROR(_get_cache_entry(fid, url, checksum, &entry));
RETURN_IF_ERROR(_get_cache_entry(fid, url, checksum, &entry, LibType::SO));
need_unref_entry = true;
}
@ -237,7 +241,8 @@ Status UserFunctionCache::get_function_ptr(int64_t fid, const std::string& orig_
Status UserFunctionCache::_get_cache_entry(int64_t fid, const std::string& url,
const std::string& checksum,
UserFunctionCacheEntry** output_entry) {
UserFunctionCacheEntry** output_entry,
LibType type) {
UserFunctionCacheEntry* entry = nullptr;
{
std::lock_guard<std::mutex> l(_cache_lock);
@ -245,7 +250,7 @@ Status UserFunctionCache::_get_cache_entry(int64_t fid, const std::string& url,
if (it != _entry_map.end()) {
entry = it->second;
} else {
entry = new UserFunctionCacheEntry(fid, checksum, _make_lib_file(fid, checksum));
entry = new UserFunctionCacheEntry(fid, checksum, _make_lib_file(fid, checksum, type), type);
entry->ref();
_entry_map.emplace(fid, entry);
@ -292,7 +297,14 @@ Status UserFunctionCache::_load_cache_entry(const std::string& url, UserFunction
RETURN_IF_ERROR(_download_lib(url, entry));
}
RETURN_IF_ERROR(_load_cache_entry_internal(entry));
if (entry->type == LibType::SO) {
RETURN_IF_ERROR(_load_cache_entry_internal(entry));
} else if (entry->type == LibType::JAR) {
RETURN_IF_ERROR(_add_to_classpath(entry));
} else {
return Status::InvalidArgument(
"Unsupported lib type! Make sure your lib type is one of 'so' and 'jar'!");
}
return Status::OK();
}
@ -356,10 +368,38 @@ Status UserFunctionCache::_load_cache_entry_internal(UserFunctionCacheEntry* ent
return Status::OK();
}
std::string UserFunctionCache::_make_lib_file(int64_t function_id, const std::string& checksum) {
// Makes the jar stored at `entry->lib_file` visible to the JVM by calling
// `URLClassLoader.addURL(new URL("file://" + lib_file))` on the system class loader.
//
// Returns Status::OK() on success. Returns an InternalError when the binary was built
// without LIBJVM, when any JNI lookup or call fails, or when the system class loader is
// not a java.net.URLClassLoader (in which case `addURL` cannot be invoked on it).
Status UserFunctionCache::_add_to_classpath(UserFunctionCacheEntry* entry) {
#ifdef LIBJVM
    const std::string path = "file://" + entry->lib_file;
    LOG(INFO) << "Add jar " << path << " to classpath";
    JNIEnv* env = nullptr;
    RETURN_IF_ERROR(JniUtil::GetJNIEnv(&env));

    // Every local reference created below lives in this frame and is released by
    // PopLocalFrame, so repeated calls from a long-lived native thread do not pile up
    // local references.
    if (env->PushLocalFrame(8) < 0) {
        // PushLocalFrame leaves an OutOfMemoryError pending when it fails.
        env->ExceptionClear();
        return Status::InternalError("Failed to reserve JNI local references");
    }

    // Shared failure path: report and clear any pending Java exception (no further JNI
    // call is allowed while one is pending), drop the local frame, build the error.
    auto fail = [env, &path](const std::string& reason) {
        if (env->ExceptionCheck()) {
            env->ExceptionDescribe();
            env->ExceptionClear();
        }
        env->PopLocalFrame(nullptr);
        return Status::InternalError("Failed to add jar " + path + " to classpath: " + reason);
    };

    jclass class_class_loader = env->FindClass("java/lang/ClassLoader");
    if (class_class_loader == nullptr) {
        return fail("class java.lang.ClassLoader not found");
    }
    jmethodID method_get_system_class_loader = env->GetStaticMethodID(
            class_class_loader, "getSystemClassLoader", "()Ljava/lang/ClassLoader;");
    if (method_get_system_class_loader == nullptr) {
        return fail("method ClassLoader.getSystemClassLoader not found");
    }
    jobject class_loader =
            env->CallStaticObjectMethod(class_class_loader, method_get_system_class_loader);
    if (env->ExceptionCheck() || class_loader == nullptr) {
        return fail("ClassLoader.getSystemClassLoader() failed");
    }

    jclass class_url_class_loader = env->FindClass("java/net/URLClassLoader");
    if (class_url_class_loader == nullptr) {
        return fail("class java.net.URLClassLoader not found");
    }
    // Calling a method ID on an object of an unrelated class is undefined behaviour in
    // JNI, so make sure the system class loader really is a URLClassLoader first.
    if (!env->IsInstanceOf(class_loader, class_url_class_loader)) {
        return fail("system class loader is not a java.net.URLClassLoader");
    }
    jmethodID method_add_url =
            env->GetMethodID(class_url_class_loader, "addURL", "(Ljava/net/URL;)V");
    if (method_add_url == nullptr) {
        return fail("method URLClassLoader.addURL not found");
    }

    jclass class_url = env->FindClass("java/net/URL");
    if (class_url == nullptr) {
        return fail("class java.net.URL not found");
    }
    jmethodID url_ctor = env->GetMethodID(class_url, "<init>", "(Ljava/lang/String;)V");
    if (url_ctor == nullptr) {
        return fail("constructor URL(String) not found");
    }
    jstring url_string = env->NewStringUTF(path.c_str());
    if (url_string == nullptr) {
        return fail("could not create the Java string for the jar URL");
    }
    jobject url_instance = env->NewObject(class_url, url_ctor, url_string);
    if (env->ExceptionCheck() || url_instance == nullptr) {
        return fail("could not construct java.net.URL");
    }

    env->CallVoidMethod(class_loader, method_add_url, url_instance);
    if (env->ExceptionCheck()) {
        return fail("URLClassLoader.addURL threw an exception");
    }

    env->PopLocalFrame(nullptr);
    return Status::OK();
#else
    return Status::InternalError("No libjvm is found!");
#endif
}
// Builds the path of the cached library file for a function:
// `<_lib_dir>/<shard>/<function_id>.<checksum>` followed by ".jar" when `type` is
// LibType::JAR and by ".so" for every other type. The shard directory is
// `function_id % kLibShardNum`.
std::string UserFunctionCache::_make_lib_file(int64_t function_id, const std::string& checksum,
LibType type) {
int shard = function_id % kLibShardNum;
std::stringstream ss;
// NOTE(review): this listing is a rendered diff, so the next two statements look like the
// pre-change and post-change versions of the same line; presumably only the second one
// (without the hard-coded ".so") belongs in the resulting function - confirm against the file.
ss << _lib_dir << '/' << shard << '/' << function_id << '.' << checksum << ".so";
ss << _lib_dir << '/' << shard << '/' << function_id << '.' << checksum;
// Choose the file extension from the library type.
if (type == LibType::JAR) {
ss << ".jar";
} else {
ss << ".so";
}
return ss.str();
}
@ -372,4 +412,12 @@ void UserFunctionCache::release_entry(UserFunctionCacheEntry* entry) {
}
}
// Resolves the local path of the jar backing function `fid` and stores it in `*libpath`.
//
// The cache entry is looked up (or created) through _get_cache_entry with LibType::JAR,
// using `url` and `checksum`. Any error from that lookup is returned unchanged and
// `*libpath` is left untouched; on success Status::OK() is returned.
Status UserFunctionCache::get_jarpath(int64_t fid, const std::string& url,
                                      const std::string& checksum, std::string* libpath) {
    UserFunctionCacheEntry* entry = nullptr;
    RETURN_IF_ERROR(_get_cache_entry(fid, url, checksum, &entry, LibType::JAR));
    // Copy the path out while the entry is still referenced.
    *libpath = entry->lib_file;
    // get_function_ptr unrefs the entry it obtains from the same _get_cache_entry call, so
    // the entry is handed back referenced; release it here, otherwise every call leaks one
    // reference and the entry can never be freed.
    release_entry(entry);
    return Status::OK();
}
} // namespace doris