Files
doris/be/src/vec/columns/subcolumn_tree.h
lihangyu 37d1519316 [WIP](dynamic-table) support dynamic schema table (#16335)
Issue Number: close #16351

A dynamic schema table is a special type of table whose schema changes during the loading procedure. We have implemented this feature mainly for semi-structured data such as JSON: since JSON is self-describing, we can extract schema info from the original documents and infer the final type information. This special table reduces manual schema change operations, makes it easy to import semi-structured data, and extends its schema automatically.
2023-02-11 13:37:50 +08:00

222 lines
7.2 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/DataTypes/Serializations/SubcolumnsTree.h
// and modified by Doris
#pragma once
#include <vec/columns/column.h>
#include <vec/common/hash_table/hash_map.h>
#include <vec/data_types/data_type.h>
#include <vec/json/path_in_data.h>
namespace doris::vectorized {
/// Tree that represents paths in a document, with additional data in nodes.
///
/// Each part of a path corresponds to one level of the tree. Nodes created for
/// the last part of an added path are requested with kind SCALAR and are also
/// recorded in a flat list of leaves, in insertion order. Nodes created for the
/// preceding parts are requested with kind TUPLE or NESTED.
///
/// @tparam NodeData payload stored in every node.
template <typename NodeData>
class SubcolumnsTree {
public:
    /// A single node of the tree.
    struct Node {
        enum Kind {
            TUPLE,
            NESTED,
            SCALAR,
        };

        explicit Node(Kind kind_) : kind(kind_) {}
        Node(Kind kind_, const NodeData& data_) : kind(kind_), data(data_) {}
        Node(Kind kind_, const NodeData& data_, const PathInData& path_)
                : kind(kind_), data(data_), path(path_) {}

        Kind kind = TUPLE;
        /// Non-owning pointer to the parent; assigned by the parent's add_child().
        const Node* parent = nullptr;
        /// Keys of `children` point at the memory returned by strings_pool.insert(),
        /// so the pool is declared before (and thus destroyed after) the map.
        Arena strings_pool;
        HashMapWithStackMemory<StringRef, std::shared_ptr<Node>, StringRefHash, 4> children;
        NodeData data;
        PathInData path;

        bool is_nested() const { return kind == NESTED; }
        bool is_scalar() const { return kind == SCALAR; }

        /// Registers `next_node` as the child stored under `key` and makes this
        /// node its parent. The key bytes are inserted into `strings_pool` first.
        /// `next_node` must not be null: it is dereferenced unconditionally.
        void add_child(std::string_view key, std::shared_ptr<Node> next_node) {
            next_node->parent = this;
            StringRef key_ref {strings_pool.insert(key.data(), key.length()), key.length()};
            children[key_ref] = std::move(next_node);
        }
    };

    using NodeKind = typename Node::Kind;
    using NodePtr = std::shared_ptr<Node>;

    /// Add a leaf without any data in other nodes.
    ///
    /// The leaf receives `leaf_data` and `path`; missing intermediate nodes are
    /// created with the kind only. Existing nodes are left untouched.
    /// @return true if the leaf was inserted, false otherwise (see the
    ///         NodeCreator overload for the failure conditions).
    bool add(const PathInData& path, const NodeData& leaf_data) {
        return add(path, [&](NodeKind kind, bool exists) -> NodePtr {
            if (exists) {
                // Nothing to create for a node that is already in the tree.
                return nullptr;
            }
            if (kind == Node::SCALAR) {
                return std::make_shared<Node>(kind, leaf_data, path);
            }
            return std::make_shared<Node>(kind);
        });
    }

    /// Callback for creation of node. Receives kind of node and
    /// flag, which is true if node already exists.
    using NodeCreator = std::function<NodePtr(NodeKind, bool)>;

    /// Adds a leaf for `path`, creating missing intermediate nodes on the way.
    ///
    /// `node_creator` is invoked
    ///   - with (kind of the existing node, true) for every intermediate node
    ///     that is already present; its return value is ignored in that case;
    ///   - with (TUPLE or NESTED, false) for every missing intermediate node;
    ///   - with (SCALAR, false) for the leaf.
    ///
    /// @return true if the leaf was inserted. false if the path has no parts,
    ///         if an existing intermediate node's nested-ness differs from the
    ///         corresponding path part, if the last part already exists under
    ///         its parent, or if `node_creator` returns a null pointer for a
    ///         node that has to be created. In the last case intermediate
    ///         nodes created before the failure stay in the tree.
    bool add(const PathInData& path, const NodeCreator& node_creator) {
        const auto& parts = path.get_parts();
        if (parts.empty()) {
            return false;
        }
        if (!root) {
            root = std::make_shared<Node>(Node::TUPLE);
        }

        Node* current_node = root.get();
        const size_t last = parts.size() - 1;
        for (size_t i = 0; i < last; ++i) {
            // NOTE(review): this check runs only at the top of an iteration, so a
            // SCALAR node reached on the final intermediate step is not checked
            // and will receive a child below; with assert disabled nothing is
            // checked at all.
            assert(current_node->kind != Node::SCALAR);

            const auto& part = parts[i];
            // View of the key; add_child() copies the bytes into the node's pool,
            // so no temporary string is needed.
            const std::string_view key(part.key.data(), part.key.size());

            auto it = current_node->children.find(StringRef {key.data(), key.size()});
            if (it != current_node->children.end()) {
                current_node = it->get_second().get();
                // Notify the creator about the existing node before validating it.
                node_creator(current_node->kind, true);
                if (current_node->is_nested() != part.is_nested) {
                    return false;
                }
                continue;
            }

            const auto next_kind = part.is_nested ? Node::NESTED : Node::TUPLE;
            NodePtr next_node = node_creator(next_kind, false);
            if (!next_node) {
                // The creator declined to build the node; do not dereference null.
                return false;
            }
            Node* created = next_node.get();
            current_node->add_child(key, std::move(next_node));
            current_node = created;
        }

        const auto& leaf_part = parts[last];
        const std::string_view leaf_key(leaf_part.key.data(), leaf_part.key.size());
        auto it = current_node->children.find(StringRef {leaf_key.data(), leaf_key.size()});
        if (it != current_node->children.end()) {
            return false;
        }

        NodePtr leaf = node_creator(Node::SCALAR, false);
        if (!leaf) {
            return false;
        }
        current_node->add_child(leaf_key, leaf);
        leaves.push_back(std::move(leaf));
        return true;
    }

    /// Find node that matches the path the best.
    /// Returns the deepest node reached while following the path (possibly the
    /// root), or nullptr if the tree is empty.
    const Node* find_best_match(const PathInData& path) const { return find_impl(path, false); }

    /// Find node that matches the path exactly.
    /// Returns nullptr if any part of the path is missing.
    const Node* find_exact(const PathInData& path) const { return find_impl(path, true); }

    /// Find leaf by path.
    /// Returns nullptr if there is no exact match or the match is not SCALAR.
    const Node* find_leaf(const PathInData& path) const {
        const auto* candidate = find_exact(path);
        if (!candidate || !candidate->is_scalar()) {
            return nullptr;
        }
        return candidate;
    }

    using NodePredicate = std::function<bool(const Node&)>;

    /// Finds leaf that satisfies the predicate, searching from the root.
    /// Returns nullptr if the tree is empty or no leaf matches.
    const Node* find_leaf(const NodePredicate& predicate) const {
        return find_leaf(root.get(), predicate);
    }

    /// Finds a leaf in the subtree of `node` that satisfies the predicate.
    /// The search is depth-first and does not descend below a SCALAR node;
    /// children are visited in the iteration order of the children map.
    static const Node* find_leaf(const Node* node, const NodePredicate& predicate) {
        if (!node) {
            return nullptr;
        }
        if (node->is_scalar()) {
            return predicate(*node) ? node : nullptr;
        }
        for (auto it = node->children.begin(); it != node->children.end(); ++it) {
            // Use the stored pointer in place; copying the shared_ptr is not needed.
            if (const auto* leaf = find_leaf(it->get_second().get(), predicate)) {
                return leaf;
            }
        }
        return nullptr;
    }

    /// Find first parent node that satisfies the predicate.
    /// `node` itself is tested first; returns nullptr if the chain of parents
    /// is exhausted without a match.
    static const Node* find_parent(const Node* node, const NodePredicate& predicate) {
        while (node && !predicate(*node)) {
            node = node->parent;
        }
        return node;
    }

    /// True until the first add() with a non-empty path creates the root.
    bool empty() const { return root == nullptr; }
    /// Number of leaves.
    size_t size() const { return leaves.size(); }

    using Nodes = std::vector<NodePtr>;

    const Nodes& get_leaves() const { return leaves; }
    const Node* get_root() const { return root.get(); }

    using iterator = typename Nodes::iterator;
    using const_iterator = typename Nodes::const_iterator;

    /// Iteration goes over the leaves in insertion order.
    iterator begin() { return leaves.begin(); }
    iterator end() { return leaves.end(); }
    const_iterator begin() const { return leaves.begin(); }
    const_iterator end() const { return leaves.end(); }

private:
    /// Follows `path` from the root part by part.
    /// On the first missing child returns nullptr when `exact` is true and the
    /// last node reached otherwise. Returns nullptr if the tree has no root.
    const Node* find_impl(const PathInData& path, bool exact) const {
        if (!root) {
            return nullptr;
        }
        const auto& parts = path.get_parts();
        const Node* current_node = root.get();
        for (const auto& part : parts) {
            auto it = current_node->children.find(StringRef {part.key.data(), part.key.size()});
            if (it == current_node->children.end()) {
                return exact ? nullptr : current_node;
            }
            current_node = it->get_second().get();
        }
        return current_node;
    }

    NodePtr root;
    /// Nodes inserted for the last part of each added path, in insertion order.
    Nodes leaves;
};
} // namespace doris::vectorized