// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #include "vec/json/parse2column.h" #include #include #include #include #include // IWYU pragma: keep #include #include #include #include #include #include #include #include #include #include #include "common/config.h" #include "common/status.h" #include "vec/columns/column.h" #include "vec/columns/column_object.h" #include "vec/columns/column_string.h" #include "vec/common/assert_cast.h" #include "vec/common/field_visitors.h" #include "vec/common/schema_util.h" #include "vec/common/string_ref.h" #include "vec/core/field.h" #include "vec/data_types/data_type.h" #include "vec/json/json_parser.h" #include "vec/json/path_in_data.h" #include "vec/json/simd_json_parser.h" namespace doris::vectorized { /** Pool for objects that cannot be used from different threads simultaneously. * Allows to create an object for each thread. * Pool has unbounded size and objects are not destroyed before destruction of pool. * * Use it in cases when thread local storage is not appropriate * (when maximum number of simultaneously used objects is less * than number of running/sleeping threads, that has ever used object, * and creation/destruction of objects is expensive). */ template class SimpleObjectPool { protected: /// Hold all available objects in stack. std::mutex mutex; std::stack> stack; /// Specialized deleter for std::unique_ptr. /// Returns underlying pointer back to stack thus reclaiming its ownership. struct Deleter { SimpleObjectPool* parent; Deleter(SimpleObjectPool* parent_ = nullptr) : parent {parent_} {} /// NOLINT void operator()(T* owning_ptr) const { std::lock_guard lock {parent->mutex}; parent->stack.emplace(owning_ptr); } }; public: using Pointer = std::unique_ptr; /// Extracts and returns a pointer from the stack if it's not empty, /// creates a new one by calling provided f() otherwise. template Pointer get(Factory&& f) { std::unique_lock lock(mutex); if (stack.empty()) { return {f(), this}; } auto object = stack.top().release(); stack.pop(); return std::unique_ptr(object, Deleter(this)); } /// Like get(), but creates object using default constructor. Pointer getDefault() { return get([] { return new T; }); } }; SimpleObjectPool> parsers_pool; using Node = typename ColumnObject::Subcolumns::Node; /// Visitor that keeps @num_dimensions_to_keep dimensions in arrays /// and replaces all scalars or nested arrays to @replacement at that level. class FieldVisitorReplaceScalars : public StaticVisitor { public: FieldVisitorReplaceScalars(const Field& replacement_, size_t num_dimensions_to_keep_) : replacement(replacement_), num_dimensions_to_keep(num_dimensions_to_keep_) {} template Field operator()(const T& x) const { if constexpr (std::is_same_v) { if (num_dimensions_to_keep == 0) { return replacement; } const size_t size = x.size(); Array res(size); for (size_t i = 0; i < size; ++i) { res[i] = apply_visitor( FieldVisitorReplaceScalars(replacement, num_dimensions_to_keep - 1), x[i]); } return res; } else { return replacement; } } private: const Field& replacement; size_t num_dimensions_to_keep; }; /// Finds a subcolumn from the same Nested type as @entry and inserts /// an array with default values with consistent sizes as in Nested type. bool try_insert_default_from_nested(const std::shared_ptr& entry, const ColumnObject::Subcolumns& subcolumns) { if (!entry->path.has_nested_part()) { return false; } const Node* current_node = subcolumns.find_leaf(entry->path); const Node* leaf = nullptr; size_t num_skipped_nested = 0; while (current_node) { /// Try to find the first Nested up to the current node. const auto* node_nested = ColumnObject::Subcolumns::find_parent( current_node, [](const auto& candidate) { return candidate.is_nested(); }); if (!node_nested) { break; } /// If there are no leaves, skip current node and find /// the next node up to the current. leaf = ColumnObject::Subcolumns::find_leaf(node_nested, [&](const auto& candidate) { return candidate.data.size() == entry->data.size() + 1; }); if (leaf) { break; } current_node = node_nested->parent; ++num_skipped_nested; } if (!leaf) { return false; } auto last_field = leaf->data.get_last_field(); if (last_field.is_null()) { return false; } const auto& least_common_type = entry->data.get_least_common_type(); size_t num_dimensions = schema_util::get_number_of_dimensions(*least_common_type); assert(num_skipped_nested < num_dimensions); /// Replace scalars to default values with consistent array sizes. size_t num_dimensions_to_keep = num_dimensions - num_skipped_nested; auto default_scalar = num_skipped_nested ? schema_util::create_empty_array_field(num_skipped_nested) : schema_util::get_base_type_of_array(least_common_type)->get_default(); auto default_field = apply_visitor( FieldVisitorReplaceScalars(default_scalar, num_dimensions_to_keep), last_field); entry->data.insert(std::move(default_field)); return true; } template Status parse_json_to_variant(IColumn& column, const char* src, size_t length, JSONDataParser* parser) { auto& column_object = assert_cast(column); std::optional result; /// Treat empty string as an empty object /// for better CAST from String to Object. if (length > 0) { result = parser->parse(src, length); } else { result = ParseResult {}; } if (!result) { LOG(INFO) << "failed to parse " << std::string_view(src, length) << ", length= " << length; return Status::DataQualityError( fmt::format("Failed to parse object {}", std::string_view(src, length))); } auto& [paths, values] = *result; assert(paths.size() == values.size()); phmap::flat_hash_set paths_set; size_t num_rows = column_object.size(); for (size_t i = 0; i < paths.size(); ++i) { FieldInfo field_info; RETURN_IF_ERROR(get_field_info(values[i], &field_info)); // TODO support multi dimensions array if (!config::enable_parse_multi_dimession_array && field_info.num_dimensions >= 2) { return Status::DataQualityError( "Sorry multi dimensions array is not supported now, we are working on it"); } if (is_nothing(field_info.scalar_type)) { continue; } if (!paths_set.insert(paths[i].get_path()).second) { return Status::DataQualityError( fmt::format("Object has ambiguous path {}, {}", paths[i].get_path())); } if (!column_object.has_subcolumn(paths[i])) { if (paths[i].has_nested_part()) { column_object.add_nested_subcolumn(paths[i], field_info, num_rows); } else { column_object.add_sub_column(paths[i], num_rows); } } auto* subcolumn = column_object.get_subcolumn(paths[i]); if (!subcolumn) { return Status::DataQualityError( fmt::format("Failed to find sub column {}", paths[i].get_path())); } assert(subcolumn->size() == num_rows); Status st = subcolumn->insert(std::move(values[i]), std::move(field_info)); if (st.is_invalid_argument()) { return Status::DataQualityError( fmt::format("Failed to insert field {}", st.to_string())); } RETURN_IF_ERROR(st); } // /// Insert default values to missed subcolumns. const auto& subcolumns = column_object.get_subcolumns(); for (const auto& entry : subcolumns) { if (!paths_set.contains(entry->path.get_path())) { bool inserted = try_insert_default_from_nested(entry, subcolumns); if (!inserted) { entry->data.insertDefault(); } } } column_object.incr_num_rows(); return Status::OK(); } bool extract_key(MutableColumns& columns, StringRef json, const std::vector& keys, const std::vector& types, JSONDataParser* parser) { return parser->extract_key(columns, json, keys, types); } // exposed interfaces Status parse_json_to_variant(IColumn& column, const StringRef& json, JSONDataParser* parser) { return parse_json_to_variant(column, json.data, json.size, parser); } Status parse_json_to_variant(IColumn& column, const std::vector& jsons) { auto parser = parsers_pool.get([] { return new JSONDataParser(); }); for (StringRef str : jsons) { RETURN_IF_ERROR(parse_json_to_variant(column, str.data, str.size, parser.get())); } return Status::OK(); } bool extract_key(MutableColumns& columns, const std::vector& jsons, const std::vector& keys, const std::vector& types) { auto parser = parsers_pool.get([] { return new JSONDataParser(); }); for (StringRef json : jsons) { if (!extract_key(columns, json, keys, types, parser.get())) { return false; } } return true; } bool extract_key(MutableColumns& columns, const ColumnString& json_column, const std::vector& keys, const std::vector& types) { auto parser = parsers_pool.get([] { return new JSONDataParser(); }); for (size_t x = 0; x < json_column.size(); ++x) { if (!extract_key(columns, json_column.get_data_at(x), keys, types, parser.get())) { return false; } } return true; } } // namespace doris::vectorized