Files
doris/be/src/exprs/json_functions.h
lihangyu 01383c3217 [Enhancement](stream-load-json) using simdjson to parse json (#11665)
Currently we use rapidjson to parse JSON documents. It is fast, but not fast enough compared to simdjson. I found that simdjson has a parsing front-end called simdjson::ondemand, which parses JSON as fields are accessed and can strip a field's token from the original document. Using this feature we can reduce the cost of string copies (e.g. we convert everything to a string literal in _write_data_to_column with sprintf, and I saw a hotspot in this function in the flame graph; simdjson::to_json_string strips the token (a string piece) as a std::string_view, which is exactly what we need). Second, in _set_column_value we can iterate through the JSON document with for (auto field : object_val) {xxx}, which is much faster than looking up a field by its field name, as in objectValue.FindMember("k1"). The third optimization is the at_pointer interface that simdjson provides, which can fetch a JSON field directly from the original document.
2022-08-16 14:49:50 +08:00

164 lines
6.3 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <fmt/core.h>
#include <rapidjson/document.h>
#include <sstream>
#include <string>
#include <string_view>
#include <utility>
#include <vector>
#include "udf/udf.h"
namespace doris {
// Tags the kind of value (integer, double or string) a JSON function works
// with; passed as `fntype` to JsonFunctions::get_json_object.
enum JsonFunctionType {
    JSON_FUN_INT = 0,
    JSON_FUN_DOUBLE = 1,
    JSON_FUN_STRING = 2,
    JSON_FUN_UNKNOWN = 3 // must remain the last enumerator
};
// Forward declarations of classes that are defined elsewhere.
// NOTE(review): none of these is referenced by the declarations visible in
// this header — confirm whether other files rely on them before removing.
class Expr;
class OpcodeRegistry;
class TupleRow;
// One step of a parsed JSON path: an object key optionally followed by an
// array subscript, plus a flag recording whether the step parsed correctly.
struct JsonPath {
    // Sentinel values stored in `idx`.
    static constexpr int IDX_NOT_SET = -1;  // the step has no array subscript
    static constexpr int IDX_WILDCARD = -2; // the step used the `[*]` subscript

    std::string key; // key of a json object
    int idx;         // array index of a json array, -1 means not set, -2 means *
    bool is_valid;   // true if the path is successfully parsed

    JsonPath(const std::string& key_, int idx_, bool is_valid_)
            : key(key_), idx(idx_), is_valid(is_valid_) {}

    JsonPath(std::string&& key_, int idx_, bool is_valid_)
            : key(std::move(key_)), idx(idx_), is_valid(is_valid_) {}

    // Renders the step as `key`, `key[N]` or `key[*]`.
    // Returns the literal "INVALID" when the step is not valid.
    std::string to_string() const {
        if (!is_valid) {
            return "INVALID";
        }
        std::string text = key;
        if (idx == IDX_WILDCARD) {
            text += "[*]";
        } else if (idx > IDX_NOT_SET) {
            text += '[';
            text += std::to_string(idx);
            text += ']';
        }
        return text;
    }

    // Renders the step as a JSON Pointer fragment: `/key` or `/key/N`.
    //
    // `~` and `/` inside the key are escaped as `~0` and `~1` (RFC 6901) so a
    // key containing them is not mistaken for extra pointer segments.
    //
    // On failure (invalid step, or the unsupported `[*]` subscript) an empty
    // string is returned and `*valid` is set to false. `*valid` is left
    // untouched on success, so the caller must initialise it beforehand.
    // `valid` may be null, in which case failure is signalled only by the
    // empty return value.
    //
    // NOTE(review): with an empty key the result still starts with "/" (for
    // example "//0" for idx 0), which RFC 6901 reads as a member named "".
    // This matches the previous behaviour; confirm it is what callers expect.
    std::string to_simdjson_pointer(bool* valid) const {
        // not support [*]
        if (!is_valid || idx == IDX_WILDCARD) {
            if (valid != nullptr) {
                *valid = false;
            }
            return "";
        }
        std::string pointer;
        pointer.reserve(key.size() + 1);
        pointer += '/';
        for (const char c : key) {
            switch (c) {
            case '~':
                pointer += "~0";
                break;
            case '/':
                pointer += "~1";
                break;
            default:
                pointer += c;
                break;
            }
        }
        if (idx > IDX_NOT_SET) {
            pointer += '/';
            pointer += std::to_string(idx);
        }
        return pointer;
    }

    // Dumps all three fields as `key:<key>, idx:<idx>, valid:<true|false>`.
    std::string debug_string() const {
        std::string text = "key:";
        text += key;
        text += ", idx:";
        text += std::to_string(idx);
        text += ", valid:";
        text += is_valid ? "true" : "false";
        return text;
    }
};
// Bundles a list of parsed JSON path steps with a rapidjson document.
// NOTE(review): presumably the per-function state managed by
// JsonFunctions::json_path_prepare / json_path_close — confirm in the .cpp.
struct JsonState {
std::vector<JsonPath> json_paths; // parsed path steps
rapidjson::Document document; // rapidjson DOM document
};
// Static entry points for the JSON functions and their JSON-path helpers.
// Only declarations live here; the behaviour notes below that go beyond the
// signatures are marked as assumptions to be checked against the .cpp file.
class JsonFunctions {
public:
    static void init();

    // Scalar getters: each takes a JSON text and a path text and returns an
    // int / string / double UDF value respectively.
    // NOTE(review): presumably the value found at `path` inside `json_str`;
    // null/error handling is not visible from this header.
    static doris_udf::IntVal get_json_int(doris_udf::FunctionContext* context,
                                          const doris_udf::StringVal& json_str,
                                          const doris_udf::StringVal& path);

    static doris_udf::StringVal get_json_string(doris_udf::FunctionContext* context,
                                                const doris_udf::StringVal& json_str,
                                                const doris_udf::StringVal& path);

    static doris_udf::DoubleVal get_json_double(doris_udf::FunctionContext* context,
                                                const doris_udf::StringVal& json_str,
                                                const doris_udf::StringVal& path);

    // Takes the JSON text, the path text, the requested value kind and a
    // caller-supplied rapidjson document; returns a rapidjson value pointer.
    // NOTE(review): the returned pointer presumably refers into `document`
    // (or is null on failure) — confirm ownership and lifetime in the .cpp.
    static rapidjson::Value* get_json_object(doris_udf::FunctionContext* context,
                                             const std::string_view& json_string,
                                             const std::string_view& path_string,
                                             const JsonFunctionType& fntype,
                                             rapidjson::Document* document);

    // Variadic functions: `json_str` points at `num_args` arguments.
    static doris_udf::StringVal json_array(doris_udf::FunctionContext* context, int num_args,
                                           const doris_udf::StringVal* json_str);

    static doris_udf::StringVal json_object(doris_udf::FunctionContext* context, int num_args,
                                            const doris_udf::StringVal* json_str);

    static doris_udf::StringVal json_quote(doris_udf::FunctionContext* context,
                                           const doris_udf::StringVal& json_str);

    /**
     * The `document` parameter must already have been parsed.
     * The returned Value is an Array object.
     * wrap_explicitly is set to true when the returned Array is wrapped actively.
     */
    static rapidjson::Value* get_json_array_from_parsed_json(
            const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
            rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly);

    // this is only for test, it will parse the json path inside,
    // so that we can easily pass a json path as string.
    static rapidjson::Value* get_json_array_from_parsed_json(
            const std::string& jsonpath, rapidjson::Value* document,
            rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly);

    // Same inputs as the parsed-path overload above, without the
    // `wrap_explicitly` out-parameter.
    static rapidjson::Value* get_json_object_from_parsed_json(
            const std::vector<JsonPath>& parsed_paths, rapidjson::Value* document,
            rapidjson::Document::AllocatorType& mem_allocator);

    // Prepare/close hooks taking a function context and a state scope.
    // NOTE(review): presumably create and release the per-function JsonState —
    // confirm in the .cpp.
    static void json_path_prepare(doris_udf::FunctionContext*,
                                  doris_udf::FunctionContext::FunctionStateScope);

    static void json_path_close(doris_udf::FunctionContext*,
                                doris_udf::FunctionContext::FunctionStateScope);

    // Parses `path_strings` and writes the resulting steps to `*parsed_paths`.
    static void parse_json_paths(const std::string& path_strings,
                                 std::vector<JsonPath>* parsed_paths);

private:
    // NOTE(review): presumably walks `document` along `parsed_paths`; the
    // meaning of `is_insert_null` is not visible here — confirm in the .cpp.
    static rapidjson::Value* match_value(const std::vector<JsonPath>& parsed_paths,
                                         rapidjson::Value* document,
                                         rapidjson::Document::AllocatorType& mem_allocator,
                                         bool is_insert_null = false);

    // Converts already-split path expressions into JsonPath steps.
    static void get_parsed_paths(const std::vector<std::string>& path_exprs,
                                 std::vector<JsonPath>* parsed_paths);

    // NOTE(review): presumably builds a rapidjson value from `arg`, with
    // `flag` and `num` selecting how it is interpreted — confirm in the .cpp.
    static rapidjson::Value parse_str_with_flag(const doris_udf::StringVal& arg,
                                                const doris_udf::StringVal& flag, const int num,
                                                rapidjson::Document::AllocatorType& allocator);
};
} // namespace doris