// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "exprs/json_functions.h" #include #include #include #include #include #include #include #include #include // IWYU pragma: keep #include #include #include #include #include #include #include #include "common/compiler_util.h" // IWYU pragma: keep #include "common/logging.h" namespace doris { // static const re2::RE2 JSON_PATTERN("^([a-zA-Z0-9_\\-\\:\\s#\\|\\.]*)(?:\\[([0-9]+)\\])?"); // json path cannot contains: ", [, ] static const re2::RE2 JSON_PATTERN("^([^\\\"\\[\\]]*)(?:\\[([0-9]+|\\*)\\])?"); rapidjson::Value* JsonFunctions::match_value(const std::vector& parsed_paths, rapidjson::Value* document, rapidjson::Document::AllocatorType& mem_allocator, bool is_insert_null) { rapidjson::Value* root = document; rapidjson::Value* array_obj = nullptr; for (int i = 1; i < parsed_paths.size(); i++) { VLOG_TRACE << "parsed_paths: " << parsed_paths[i].debug_string(); if (root == nullptr || root->IsNull()) { return nullptr; } if (UNLIKELY(!parsed_paths[i].is_valid)) { return nullptr; } const std::string& col = parsed_paths[i].key; int index = parsed_paths[i].idx; if (LIKELY(!col.empty())) { if (root->IsArray()) { array_obj = static_cast( mem_allocator.Malloc(sizeof(rapidjson::Value))); array_obj->SetArray(); bool is_null = true; // if array ,loop the array,find out all Objects,then find the results from the objects for (int j = 0; j < root->Size(); j++) { rapidjson::Value* json_elem = &((*root)[j]); if (json_elem->IsArray() || json_elem->IsNull()) { continue; } else { if (!json_elem->IsObject()) { continue; } if (!json_elem->HasMember(col.c_str())) { if (is_insert_null) { // not found item, then insert a null object. is_null = false; rapidjson::Value nullObject(rapidjson::kNullType); array_obj->PushBack(nullObject, mem_allocator); } continue; } rapidjson::Value* obj = &((*json_elem)[col.c_str()]); if (obj->IsArray()) { is_null = false; for (int k = 0; k < obj->Size(); k++) { array_obj->PushBack((*obj)[k], mem_allocator); } } else if (!obj->IsNull()) { is_null = false; array_obj->PushBack(*obj, mem_allocator); } } } root = is_null ? &(array_obj->SetNull()) : array_obj; } else if (root->IsObject()) { if (!root->HasMember(col.c_str())) { return nullptr; } else { root = &((*root)[col.c_str()]); } } else { // root is not a nested type, return nullptr return nullptr; } } if (UNLIKELY(index != -1)) { // judge the rapidjson:Value, which base the top's result, // if not array return nullptr;else get the index value from the array if (root->IsArray()) { if (root->IsNull()) { return nullptr; } else if (index == -2) { // [*] array_obj = static_cast( mem_allocator.Malloc(sizeof(rapidjson::Value))); array_obj->SetArray(); for (int j = 0; j < root->Size(); j++) { rapidjson::Value v; v.CopyFrom((*root)[j], mem_allocator); array_obj->PushBack(v, mem_allocator); } root = array_obj; } else if (index >= root->Size()) { return nullptr; } else { root = &((*root)[index]); } } else { return nullptr; } } } return root; } rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json( const std::string& json_path, rapidjson::Value* document, rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) { std::vector vec; parse_json_paths(json_path, &vec); return get_json_array_from_parsed_json(vec, document, mem_allocator, wrap_explicitly); } rapidjson::Value* JsonFunctions::get_json_array_from_parsed_json( const std::vector& parsed_paths, rapidjson::Value* document, rapidjson::Document::AllocatorType& mem_allocator, bool* wrap_explicitly) { *wrap_explicitly = false; if (!parsed_paths[0].is_valid) { return nullptr; } if (parsed_paths.size() == 1) { // the json path is "$", just return entire document // wrapper an array rapidjson::Value* array_obj = nullptr; array_obj = static_cast(mem_allocator.Malloc(sizeof(rapidjson::Value))); array_obj->SetArray(); array_obj->PushBack(*document, mem_allocator); return array_obj; } rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true); if (root == nullptr || root == document) { // not found return nullptr; } else if (!root->IsArray() && wrap_explicitly) { rapidjson::Value* array_obj = nullptr; array_obj = static_cast(mem_allocator.Malloc(sizeof(rapidjson::Value))); array_obj->SetArray(); rapidjson::Value copy; copy.CopyFrom(*root, mem_allocator); array_obj->PushBack(std::move(copy), mem_allocator); // set `wrap_explicitly` to true, so that the caller knows that this Array is wrapped actively. *wrap_explicitly = true; return array_obj; } return root; } rapidjson::Value* JsonFunctions::get_json_object_from_parsed_json( const std::vector& parsed_paths, rapidjson::Value* document, rapidjson::Document::AllocatorType& mem_allocator) { if (!parsed_paths[0].is_valid) { return nullptr; } if (parsed_paths.size() == 1) { // the json path is "$", just return entire document return document; } rapidjson::Value* root = match_value(parsed_paths, document, mem_allocator, true); if (root == nullptr || root == document) { // not found return nullptr; } return root; } void JsonFunctions::parse_json_paths(const std::string& path_string, std::vector* parsed_paths) { // split path by ".", and escape quota by "\" // eg: // '$.text#abc.xyz' -> [$, text#abc, xyz] // '$."text.abc".xyz' -> [$, text.abc, xyz] // '$."text.abc"[1].xyz' -> [$, text.abc[1], xyz] boost::tokenizer> tok( path_string, boost::escaped_list_separator("\\", ".", "\"")); std::vector paths(tok.begin(), tok.end()); get_parsed_paths(paths, parsed_paths); } void JsonFunctions::get_parsed_paths(const std::vector& path_exprs, std::vector* parsed_paths) { if (path_exprs.empty()) { return; } if (path_exprs[0] != "$") { parsed_paths->emplace_back("", -1, false); } else { parsed_paths->emplace_back("$", -1, true); } for (int i = 1; i < path_exprs.size(); i++) { std::string col; std::string index; if (UNLIKELY(!RE2::FullMatch(path_exprs[i], JSON_PATTERN, &col, &index))) { parsed_paths->emplace_back("", -1, false); } else { int idx = -1; if (!index.empty()) { if (index == "*") { idx = -2; } else { idx = atoi(index.c_str()); } } parsed_paths->emplace_back(std::move(col), idx, true); } } } Status JsonFunctions::extract_from_object(simdjson::ondemand::object& obj, const std::vector& jsonpath, simdjson::ondemand::value* value) noexcept { // Return DataQualityError when it's a malformed json. // Otherwise the path was not found, due to // 1. array out of bound // 2. not exist such field in object // 3. the input type is not object but could be null or other types and lead to simdjson::INCORRECT_TYPE #define HANDLE_SIMDJSON_ERROR(err, msg) \ do { \ const simdjson::error_code& _err = err; \ const std::string& _msg = msg; \ if (UNLIKELY(_err)) { \ if (_err == simdjson::NO_SUCH_FIELD || _err == simdjson::INDEX_OUT_OF_BOUNDS || \ _err == simdjson::INCORRECT_TYPE) { \ return Status::NotFound( \ fmt::format("Not found target filed, err: {}, msg: {}", \ simdjson::error_message(_err), _msg)); \ } \ return Status::DataQualityError( \ fmt::format("err: {}, msg: {}", simdjson::error_message(_err), _msg)); \ } \ } while (false); if (jsonpath.size() <= 1) { // The first elem of json path should be '$'. // A valid json path's size is >= 2. return Status::DataQualityError("empty json path"); } simdjson::ondemand::value tvalue; // Skip the first $. for (int i = 1; i < jsonpath.size(); i++) { if (UNLIKELY(!jsonpath[i].is_valid)) { return Status::DataQualityError(fmt::format("invalid json path: {}", jsonpath[i].key)); } const std::string& col = jsonpath[i].key; int index = jsonpath[i].idx; // Since the simdjson::ondemand::object cannot be converted to simdjson::ondemand::value, // we have to do some special treatment for the second elem of json path. // If the key is not found in json object, simdjson::NO_SUCH_FIELD would be returned. if (i == 1) { HANDLE_SIMDJSON_ERROR(obj.find_field_unordered(col).get(tvalue), fmt::format("unable to find field: {}", col)); } else { HANDLE_SIMDJSON_ERROR(tvalue.find_field_unordered(col).get(tvalue), fmt::format("unable to find field: {}", col)); } // TODO support [*] which idex == -2 if (index != -1) { // try to access tvalue as array. // If the index is beyond the length of array, simdjson::INDEX_OUT_OF_BOUNDS would be returned. simdjson::ondemand::array arr; HANDLE_SIMDJSON_ERROR(tvalue.get_array().get(arr), fmt::format("failed to access field as array, field: {}", col)); HANDLE_SIMDJSON_ERROR( arr.at(index).get(tvalue), fmt::format("failed to access array field: {}, index: {}", col, index)); } } std::swap(*value, tvalue); return Status::OK(); } std::string JsonFunctions::print_json_value(const rapidjson::Value& value) { rapidjson::StringBuffer buffer; buffer.Clear(); rapidjson::Writer writer(buffer); value.Accept(writer); return std::string(buffer.GetString()); } void JsonFunctions::merge_objects(rapidjson::Value& dst_object, rapidjson::Value& src_object, rapidjson::Document::AllocatorType& allocator) { if (!src_object.IsObject()) { return; } VLOG_DEBUG << "merge from src: " << print_json_value(src_object) << ", to: " << print_json_value(dst_object); for (auto src_it = src_object.MemberBegin(); src_it != src_object.MemberEnd(); ++src_it) { auto dst_it = dst_object.FindMember(src_it->name); if (dst_it != dst_object.MemberEnd()) { if (src_it->value.IsObject() && dst_it->value.IsObject()) { merge_objects(dst_it->value, src_it->value, allocator); } else { if (dst_it->value.IsNull()) { dst_it->value = src_it->value; } } } else { dst_object.AddMember(src_it->name, src_it->value, allocator); } } } // root path "$." bool JsonFunctions::is_root_path(const std::vector& json_path) { return json_path.size() == 2 && json_path[0].key == "$" && json_path[1].key.empty(); } } // namespace doris