// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include #include "common/status.h" #include "exec/exec_node.h" #include "runtime/runtime_state.h" #include "vec/core/block.h" #include "vec/exec/vdata_gen_scan_node.h" #define OPERATOR_CODE_GENERATOR(NAME, SUBCLASS) \ NAME##Builder::NAME##Builder(int32_t id, ExecNode* exec_node) \ : OperatorBuilder(id, #NAME, exec_node) {} \ \ OperatorPtr NAME##Builder::build_operator() { return std::make_shared(this, _node); } \ \ NAME::NAME(OperatorBuilderBase* operator_builder, ExecNode* node) \ : SUBCLASS(operator_builder, node) {}; namespace doris::pipeline { // Result of source pull data, init state is DEPEND_ON_SOURCE enum class SourceState : uint8_t { DEPEND_ON_SOURCE = 0, // Operator has no more data in itself, needs to read from source. MORE_DATA = 1, // Still have data can read FINISHED = 2 }; enum class SinkState : uint8_t { SINK_IDLE = 0, // can send block to sink SINK_BUSY = 1, // sink buffer is full, should wait sink to send some block FINISHED = 2 }; //////////////// DO NOT USE THE UP State //////////////// class OperatorBuilderBase; class OperatorBase; using OperatorPtr = std::shared_ptr; using Operators = std::vector; using OperatorBuilderPtr = std::shared_ptr; using OperatorBuilders = std::vector; class OperatorBuilderBase { public: OperatorBuilderBase(int32_t id, const std::string& name) : _id(id), _name(name) {} virtual ~OperatorBuilderBase() = default; virtual OperatorPtr build_operator() = 0; virtual bool is_sink() const { return false; } virtual bool is_source() const { return false; } // create the object used by all operator virtual Status prepare(RuntimeState* state); // destory the object used by all operator virtual void close(RuntimeState* state); std::string get_name() const { return _name; } RuntimeState* runtime_state() { return _state; } virtual const RowDescriptor& row_desc() = 0; int32_t id() const { return _id; } protected: const int32_t _id; const std::string _name; RuntimeState* _state = nullptr; bool _is_closed = false; }; template class OperatorBuilder : public OperatorBuilderBase { public: OperatorBuilder(int32_t id, const std::string& name, ExecNode* exec_node = nullptr) : OperatorBuilderBase(id, name), _node(reinterpret_cast(exec_node)) {} ~OperatorBuilder() override = default; const RowDescriptor& row_desc() override { return _node->row_desc(); } NodeType* exec_node() const { return _node; } protected: NodeType* _node; }; template class DataSinkOperatorBuilder : public OperatorBuilderBase { public: DataSinkOperatorBuilder(int32_t id, const std::string& name, DataSink* sink = nullptr) : OperatorBuilderBase(id, name), _sink(reinterpret_cast(sink)) {} ~DataSinkOperatorBuilder() override = default; bool is_sink() const override { return true; } const RowDescriptor& row_desc() override { return _sink->row_desc(); } SinkType* exec_node() const { return _sink; } protected: SinkType* _sink; }; class OperatorBase { public: explicit OperatorBase(OperatorBuilderBase* operator_builder); virtual ~OperatorBase() = default; // After both sink and source need to know the cancel state. // do cancel work bool is_sink() const; bool is_source() const; // Only result sink and data stream sink need to impl the virtual function virtual Status init(const TDataSink& tsink) { return Status::OK(); }; // Do prepare some state of Operator virtual Status prepare(RuntimeState* state) = 0; // Like ExecNode,when pipeline task first time be scheduled, can't block // the pipeline should be open after dependencies is finish // Eg a -> c, b-> c, after a, b pipeline finish, c pipeline should call open // Now the pipeline only have one task, so the there is no performance bottleneck for the mechanism, // but if one pipeline have multi task to parallel work, need to rethink the logic // // Each operator should call alloc_resource() to prepare resource to do data compute. // if ExecNode split to sink and source operator, alloc_resource() should be called in sink operator virtual Status open(RuntimeState* state) = 0; // Release the resource, should not block the thread // // Each operator should call close_self() to release resource // if ExecNode split to sink and source operator, close_self() should be called in source operator virtual Status close(RuntimeState* state) = 0; Status set_child(OperatorPtr child) { if (is_source()) { return Status::InternalError("source can not has child."); } _child = std::move(child); return Status::OK(); } virtual bool can_read() { return false; } // for source virtual bool can_write() { return false; } // for sink // for pipeline virtual Status get_block(RuntimeState* runtime_state, vectorized::Block* block, SourceState& result_state) { return Status::OK(); }; // return can write continue virtual Status sink(RuntimeState* state, vectorized::Block* block, SourceState source_state) = 0; virtual Status finalize(RuntimeState* state) { std::stringstream error_msg; error_msg << " not a sink, can not finalize"; return Status::NotSupported(error_msg.str()); } // close be called // - Source: scan thread do not exist // - Sink: RPC do not be disposed // - else return false virtual bool is_pending_finish() const { return false; } // TODO: should we keep the function // virtual bool is_finished() = 0; bool is_closed() const { return _is_closed; } MemTracker* mem_tracker() const { return _mem_tracker.get(); } const OperatorBuilderBase* operator_builder() const { return _operator_builder; } const RowDescriptor& row_desc(); RuntimeProfile* runtime_profile() { return _runtime_profile.get(); } std::string debug_string() const; protected: std::unique_ptr _mem_tracker; OperatorBuilderBase* _operator_builder; // source has no child // if an operator is not source, it will get data from its child. OperatorPtr _child; std::unique_ptr _runtime_profile; // TODO pipeline Account for peak memory used by this operator RuntimeProfile::Counter* _memory_used_counter = nullptr; private: bool _is_closed = false; }; template class DataSinkOperator : public OperatorBase { public: using NodeType = std::remove_pointer_t().exec_node())>; DataSinkOperator(OperatorBuilderBase* builder, DataSink* sink) : OperatorBase(builder), _sink(reinterpret_cast(sink)) {}; ~DataSinkOperator() override = default; Status prepare(RuntimeState* state) override { RETURN_IF_ERROR(_sink->prepare(state)); _runtime_profile.reset(new RuntimeProfile(_operator_builder->get_name())); _sink->profile()->insert_child_head(_runtime_profile.get(), true); _mem_tracker = std::make_unique("Operator:" + _runtime_profile->name(), _runtime_profile.get()); return Status::OK(); } Status open(RuntimeState* state) override { SCOPED_TIMER(_runtime_profile->total_time_counter()); return _sink->open(state); } Status sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) override { SCOPED_TIMER(_runtime_profile->total_time_counter()); if (!UNLIKELY(in_block)) { DCHECK(source_state == SourceState::FINISHED) << "block is null, eos should invoke in finalize."; return Status::OK(); } return _sink->send(state, in_block, source_state == SourceState::FINISHED); } Status close(RuntimeState* state) override { _fresh_exec_timer(_sink); return _sink->close(state, Status::OK()); } Status finalize(RuntimeState* state) override { return Status::OK(); } protected: void _fresh_exec_timer(NodeType* node) { node->profile()->total_time_counter()->update( _runtime_profile->total_time_counter()->value()); } NodeType* _sink; }; template class Operator : public OperatorBase { public: using NodeType = std::remove_pointer_t().exec_node())>; Operator(OperatorBuilderBase* builder, ExecNode* node) : OperatorBase(builder), _node(reinterpret_cast(node)) {}; ~Operator() override = default; Status prepare(RuntimeState* state) override { _runtime_profile.reset(new RuntimeProfile(_operator_builder->get_name())); _node->runtime_profile()->insert_child_head(_runtime_profile.get(), true); _mem_tracker = std::make_unique("Operator:" + _runtime_profile->name(), _runtime_profile.get()); _node->increase_ref(); return Status::OK(); } Status open(RuntimeState* state) override { SCOPED_TIMER(_runtime_profile->total_time_counter()); RETURN_IF_ERROR(_node->alloc_resource(state)); return Status::OK(); } Status sink(RuntimeState* state, vectorized::Block* in_block, SourceState source_state) override { SCOPED_TIMER(_runtime_profile->total_time_counter()); return _node->sink(state, in_block, source_state == SourceState::FINISHED); } Status close(RuntimeState* state) override { _fresh_exec_timer(_node); if (!_node->decrease_ref()) { _node->release_resource(state); } return Status::OK(); } Status get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) override { SCOPED_TIMER(_runtime_profile->total_time_counter()); bool eos = false; RETURN_IF_ERROR(_node->pull(state, block, &eos)); source_state = eos ? SourceState::FINISHED : SourceState::DEPEND_ON_SOURCE; return Status::OK(); } Status finalize(RuntimeState* state) override { return Status::OK(); } bool can_read() override { return _node->can_read(); } protected: void _fresh_exec_timer(NodeType* node) { node->runtime_profile()->total_time_counter()->update( _runtime_profile->total_time_counter()->value()); } NodeType* _node; }; template class DataStateOperator : public Operator { public: using NodeType = std::remove_pointer_t().exec_node())>; DataStateOperator(OperatorBuilderBase* builder, ExecNode* node) : Operator(builder, node), _child_block(new vectorized::Block), _child_source_state(SourceState::DEPEND_ON_SOURCE) {}; virtual ~DataStateOperator() = default; Status get_block(RuntimeState* state, vectorized::Block* block, SourceState& source_state) override { auto& node = Operator::_node; auto& child = Operator::_child; if (node->need_more_input_data()) { RETURN_IF_ERROR(child->get_block(state, _child_block.get(), _child_source_state)); source_state = _child_source_state; if (_child_block->rows() == 0) { return Status::OK(); } node->push(state, _child_block.get(), source_state == SourceState::FINISHED); } bool eos = false; RETURN_IF_ERROR(node->pull(state, block, &eos)); if (eos) { source_state = SourceState::FINISHED; _child_block->clear_column_data(); } else if (!node->need_more_input_data()) { source_state = SourceState::MORE_DATA; } else { _child_block->clear_column_data(); } return Status::OK(); } protected: std::unique_ptr _child_block; SourceState _child_source_state; }; } // namespace doris::pipeline