Files
doris/be/src/pipeline/exec/operator.h
Pxl 48a9166aa4 [Pipeline](sink) support olap table sink operator (#14872)
* support olap table sink operator

* update config
2022-12-07 15:29:56 +08:00

386 lines
13 KiB
C++

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <utility>
#include "common/status.h"
#include "exec/exec_node.h"
#include "runtime/runtime_state.h"
#include "vec/core/block.h"
#include "vec/exec/vdata_gen_scan_node.h"
#define OPERATOR_CODE_GENERATOR(NAME, SUBCLASS) \
NAME##Builder::NAME##Builder(int32_t id, ExecNode* exec_node) \
: OperatorBuilder(id, #NAME, exec_node) {} \
\
OperatorPtr NAME##Builder::build_operator() { return std::make_shared<NAME>(this, _node); } \
\
NAME::NAME(OperatorBuilderBase* operator_builder, ExecNode* node) \
: SUBCLASS(operator_builder, node) {};
namespace doris::pipeline {
// Result of source pull data, init state is DEPEND_ON_SOURCE
enum class SourceState : uint8_t {
DEPEND_ON_SOURCE = 0, // Operator has no more data in itself, needs to read from source.
MORE_DATA = 1, // Still have data can read
FINISHED = 2
};
enum class SinkState : uint8_t {
SINK_IDLE = 0, // can send block to sink
SINK_BUSY = 1, // sink buffer is full, should wait sink to send some block
FINISHED = 2
};
//////////////// DO NOT USE THE UP State ////////////////
class OperatorBuilderBase;
class OperatorBase;
using OperatorPtr = std::shared_ptr<OperatorBase>;
using Operators = std::vector<OperatorPtr>;
using OperatorBuilderPtr = std::shared_ptr<OperatorBuilderBase>;
using OperatorBuilders = std::vector<OperatorBuilderPtr>;
class OperatorBuilderBase {
public:
OperatorBuilderBase(int32_t id, const std::string& name) : _id(id), _name(name) {}
virtual ~OperatorBuilderBase() = default;
virtual OperatorPtr build_operator() = 0;
virtual bool is_sink() const { return false; }
virtual bool is_source() const { return false; }
// create the object used by all operator
virtual Status prepare(RuntimeState* state);
// destory the object used by all operator
virtual void close(RuntimeState* state);
std::string get_name() const { return _name; }
RuntimeState* runtime_state() { return _state; }
virtual const RowDescriptor& row_desc() = 0;
int32_t id() const { return _id; }
protected:
const int32_t _id;
const std::string _name;
RuntimeState* _state = nullptr;
bool _is_closed = false;
};
template <typename NodeType>
class OperatorBuilder : public OperatorBuilderBase {
public:
OperatorBuilder(int32_t id, const std::string& name, ExecNode* exec_node = nullptr)
: OperatorBuilderBase(id, name), _node(reinterpret_cast<NodeType*>(exec_node)) {}
~OperatorBuilder() override = default;
const RowDescriptor& row_desc() override { return _node->row_desc(); }
NodeType* exec_node() const { return _node; }
protected:
NodeType* _node;
};
template <typename SinkType>
class DataSinkOperatorBuilder : public OperatorBuilderBase {
public:
DataSinkOperatorBuilder(int32_t id, const std::string& name, DataSink* sink = nullptr)
: OperatorBuilderBase(id, name), _sink(reinterpret_cast<SinkType*>(sink)) {}
~DataSinkOperatorBuilder() override = default;
bool is_sink() const override { return true; }
const RowDescriptor& row_desc() override { return _sink->row_desc(); }
SinkType* exec_node() const { return _sink; }
protected:
SinkType* _sink;
};
class OperatorBase {
public:
explicit OperatorBase(OperatorBuilderBase* operator_builder);
virtual ~OperatorBase() = default;
// After both sink and source need to know the cancel state.
// do cancel work
bool is_sink() const;
bool is_source() const;
// Only result sink and data stream sink need to impl the virtual function
virtual Status init(const TDataSink& tsink) { return Status::OK(); };
// Do prepare some state of Operator
virtual Status prepare(RuntimeState* state) = 0;
// Like ExecNode,when pipeline task first time be scheduled, can't block
// the pipeline should be open after dependencies is finish
// Eg a -> c, b-> c, after a, b pipeline finish, c pipeline should call open
// Now the pipeline only have one task, so the there is no performance bottleneck for the mechanism,
// but if one pipeline have multi task to parallel work, need to rethink the logic
//
// Each operator should call alloc_resource() to prepare resource to do data compute.
// if ExecNode split to sink and source operator, alloc_resource() should be called in sink operator
virtual Status open(RuntimeState* state) = 0;
// Release the resource, should not block the thread
//
// Each operator should call close_self() to release resource
// if ExecNode split to sink and source operator, close_self() should be called in source operator
virtual Status close(RuntimeState* state) = 0;
Status set_child(OperatorPtr child) {
if (is_source()) {
return Status::InternalError("source can not has child.");
}
_child = std::move(child);
return Status::OK();
}
virtual bool can_read() { return false; } // for source
virtual bool can_write() { return false; } // for sink
// for pipeline
virtual Status get_block(RuntimeState* runtime_state, vectorized::Block* block,
SourceState& result_state) {
return Status::OK();
};
// return can write continue
virtual Status sink(RuntimeState* state, vectorized::Block* block,
SourceState source_state) = 0;
virtual Status finalize(RuntimeState* state) {
std::stringstream error_msg;
error_msg << " not a sink, can not finalize";
return Status::NotSupported(error_msg.str());
}
// close be called
// - Source: scan thread do not exist
// - Sink: RPC do not be disposed
// - else return false
virtual bool is_pending_finish() const { return false; }
// TODO: should we keep the function
// virtual bool is_finished() = 0;
bool is_closed() const { return _is_closed; }
MemTracker* mem_tracker() const { return _mem_tracker.get(); }
const OperatorBuilderBase* operator_builder() const { return _operator_builder; }
const RowDescriptor& row_desc();
RuntimeProfile* runtime_profile() { return _runtime_profile.get(); }
std::string debug_string() const;
protected:
std::unique_ptr<MemTracker> _mem_tracker;
OperatorBuilderBase* _operator_builder;
// source has no child
// if an operator is not source, it will get data from its child.
OperatorPtr _child;
std::unique_ptr<RuntimeProfile> _runtime_profile;
// TODO pipeline Account for peak memory used by this operator
RuntimeProfile::Counter* _memory_used_counter = nullptr;
private:
bool _is_closed = false;
};
template <typename OperatorBuilderType>
class DataSinkOperator : public OperatorBase {
public:
using NodeType =
std::remove_pointer_t<decltype(std::declval<OperatorBuilderType>().exec_node())>;
DataSinkOperator(OperatorBuilderBase* builder, DataSink* sink)
: OperatorBase(builder), _sink(reinterpret_cast<NodeType*>(sink)) {};
~DataSinkOperator() override = default;
Status prepare(RuntimeState* state) override {
RETURN_IF_ERROR(_sink->prepare(state));
_runtime_profile.reset(new RuntimeProfile(_operator_builder->get_name()));
_sink->profile()->insert_child_head(_runtime_profile.get(), true);
_mem_tracker = std::make_unique<MemTracker>("Operator:" + _runtime_profile->name(),
_runtime_profile.get());
return Status::OK();
}
Status open(RuntimeState* state) override {
SCOPED_TIMER(_runtime_profile->total_time_counter());
return _sink->open(state);
}
Status sink(RuntimeState* state, vectorized::Block* in_block,
SourceState source_state) override {
SCOPED_TIMER(_runtime_profile->total_time_counter());
if (!UNLIKELY(in_block)) {
DCHECK(source_state == SourceState::FINISHED)
<< "block is null, eos should invoke in finalize.";
return Status::OK();
}
return _sink->send(state, in_block, source_state == SourceState::FINISHED);
}
Status close(RuntimeState* state) override {
_fresh_exec_timer(_sink);
return _sink->close(state, Status::OK());
}
Status finalize(RuntimeState* state) override { return Status::OK(); }
protected:
void _fresh_exec_timer(NodeType* node) {
node->profile()->total_time_counter()->update(
_runtime_profile->total_time_counter()->value());
}
NodeType* _sink;
};
template <typename OperatorBuilderType>
class Operator : public OperatorBase {
public:
using NodeType =
std::remove_pointer_t<decltype(std::declval<OperatorBuilderType>().exec_node())>;
Operator(OperatorBuilderBase* builder, ExecNode* node)
: OperatorBase(builder), _node(reinterpret_cast<NodeType*>(node)) {};
~Operator() override = default;
Status prepare(RuntimeState* state) override {
_runtime_profile.reset(new RuntimeProfile(_operator_builder->get_name()));
_node->runtime_profile()->insert_child_head(_runtime_profile.get(), true);
_mem_tracker = std::make_unique<MemTracker>("Operator:" + _runtime_profile->name(),
_runtime_profile.get());
_node->increase_ref();
return Status::OK();
}
Status open(RuntimeState* state) override {
SCOPED_TIMER(_runtime_profile->total_time_counter());
RETURN_IF_ERROR(_node->alloc_resource(state));
return Status::OK();
}
Status sink(RuntimeState* state, vectorized::Block* in_block,
SourceState source_state) override {
SCOPED_TIMER(_runtime_profile->total_time_counter());
return _node->sink(state, in_block, source_state == SourceState::FINISHED);
}
Status close(RuntimeState* state) override {
_fresh_exec_timer(_node);
if (!_node->decrease_ref()) {
_node->release_resource(state);
}
return Status::OK();
}
Status get_block(RuntimeState* state, vectorized::Block* block,
SourceState& source_state) override {
SCOPED_TIMER(_runtime_profile->total_time_counter());
bool eos = false;
RETURN_IF_ERROR(_node->pull(state, block, &eos));
source_state = eos ? SourceState::FINISHED : SourceState::DEPEND_ON_SOURCE;
return Status::OK();
}
Status finalize(RuntimeState* state) override { return Status::OK(); }
bool can_read() override { return _node->can_read(); }
protected:
void _fresh_exec_timer(NodeType* node) {
node->runtime_profile()->total_time_counter()->update(
_runtime_profile->total_time_counter()->value());
}
NodeType* _node;
};
template <typename OperatorBuilderType>
class DataStateOperator : public Operator<OperatorBuilderType> {
public:
using NodeType =
std::remove_pointer_t<decltype(std::declval<OperatorBuilderType>().exec_node())>;
DataStateOperator(OperatorBuilderBase* builder, ExecNode* node)
: Operator<OperatorBuilderType>(builder, node),
_child_block(new vectorized::Block),
_child_source_state(SourceState::DEPEND_ON_SOURCE) {};
virtual ~DataStateOperator() = default;
Status get_block(RuntimeState* state, vectorized::Block* block,
SourceState& source_state) override {
auto& node = Operator<OperatorBuilderType>::_node;
auto& child = Operator<OperatorBuilderType>::_child;
if (node->need_more_input_data()) {
RETURN_IF_ERROR(child->get_block(state, _child_block.get(), _child_source_state));
source_state = _child_source_state;
if (_child_block->rows() == 0) {
return Status::OK();
}
node->push(state, _child_block.get(), source_state == SourceState::FINISHED);
}
bool eos = false;
RETURN_IF_ERROR(node->pull(state, block, &eos));
if (eos) {
source_state = SourceState::FINISHED;
_child_block->clear_column_data();
} else if (!node->need_more_input_data()) {
source_state = SourceState::MORE_DATA;
} else {
_child_block->clear_column_data();
}
return Status::OK();
}
protected:
std::unique_ptr<vectorized::Block> _child_block;
SourceState _child_source_state;
};
} // namespace doris::pipeline