// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#ifndef DORIS_BE_SRC_QUERY_EXEC_EXEC_NODE_H
#define DORIS_BE_SRC_QUERY_EXEC_EXEC_NODE_H

#include <mutex>
#include <sstream>
#include <vector>

#include "common/status.h"
#include "gen_cpp/PlanNodes_types.h"
#include "runtime/descriptors.h"
#include "runtime/mem_pool.h"
#include "util/runtime_profile.h"
#include "util/blocking_queue.hpp"
#include "runtime/bufferpool/buffer_pool.h"
#include "runtime/query_statistics.h"
#include "service/backend_options.h"
#include "util/uid_util.h" // for print_id

namespace doris {

class Expr;
class ExprContext;
class ObjectPool;
class Counters;
class RowBatch;
class RuntimeState;
class TPlan;
class TupleRow;
class DataSink;
class MemTracker;

using std::string;
using std::stringstream;
using std::vector;
using std::map;
using boost::lock_guard;
using boost::mutex;

// Superclass of all executor nodes.
// All subclasses need to make sure to check RuntimeState::is_cancelled()
// periodically in order to ensure timely termination after the cancellation
// flag gets set.
class ExecNode {
public:
    // Init conjuncts.
    ExecNode(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs);

    virtual ~ExecNode();

    /// Initializes this object from the thrift tnode desc. The subclass should
    /// do any initialization that can fail in init() rather than in the ctor.
    /// If overridden in a subclass, must first call the superclass's init().
    virtual Status init(const TPlanNode& tnode, RuntimeState* state = nullptr);

    // Sets up internal structures, etc., without doing any actual work.
    // Must be called prior to open(). Will only be called once in this
    // node's lifetime.
    // All code generation (adding functions to the LlvmCodeGen object) must happen
    // in prepare(). Retrieving the jit-compiled function pointer must happen in
    // open().
    // If overridden in a subclass, must first call the superclass's prepare().
    virtual Status prepare(RuntimeState* state);

    // Performs any preparatory work prior to calling get_next().
    // Can be called repeatedly (after calls to close()).
    // Caller must not be holding any io buffers; doing so will cause deadlock.
    virtual Status open(RuntimeState* state);
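    // To illustrate the override contract above: a subclass's prepare() is
    // expected to chain to the base class before doing its own setup. The
    // sketch below is illustrative only; "MyScanNode" is a hypothetical
    // subclass, not part of this header:
    //
    //   Status MyScanNode::prepare(RuntimeState* state) {
    //       RETURN_IF_ERROR(ExecNode::prepare(state));
    //       // ... subclass-specific setup that may fail ...
    //       return Status::OK();
    //   }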
    // Retrieves rows and returns them via row_batch. Sets eos to true
    // if subsequent calls will not retrieve any more rows.
    // Data referenced by any tuples returned in row_batch must not be overwritten
    // by the callee until close() is called. The memory holding that data
    // can be returned via row_batch's tuple_data_pool (in which case it may be deleted
    // by the caller) or held on to by the callee. The row_batch, including its
    // tuple_data_pool, will be destroyed by the caller at some point prior to the final
    // close() call.
    // In other words, if the memory holding the tuple data will be referenced
    // by the callee in subsequent get_next() calls, it must *not* be attached to the
    // row_batch's tuple_data_pool.
    // Caller must not be holding any io buffers; doing so will cause deadlock.
    // TODO: AggregationNode and HashJoinNode cannot be "re-opened" yet.
    virtual Status get_next(RuntimeState* state, RowBatch* row_batch, bool* eos) = 0;

    // Resets the stream of row batches to be retrieved by subsequent get_next() calls.
    // Clears all internal state, returning this node to the state it was in after
    // calling prepare() and before calling open(). This function must not clear memory
    // still owned by this node that is backing rows returned in get_next().
    // prepare() and open() must have already been called before calling reset().
    // get_next() may have optionally been called (not necessarily until eos).
    // close() must not have been called.
    // reset() is not idempotent. Calling it multiple times in a row without a preceding
    // call to open() is invalid.
    // If overridden in a subclass, must call the superclass's reset() at the end. The
    // default implementation calls reset() on children.
    // Note that this function may be called many times (proportional to the input
    // data), so it should be fast.
    virtual Status reset(RuntimeState* state);

    // This should be called before close() and after get_next(). It is responsible for
    // collecting the statistics sent with a row batch. It can't be called when
    // prepare() returns an error.
    virtual Status collect_query_statistics(QueryStatistics* statistics);

    // close() will get called for every exec node, regardless of what else is called
    // and the status of those calls (i.e. prepare() may never have been called, or
    // prepare()/open()/get_next() returned with an error).
    // close() releases all resources that were allocated in open()/get_next(), even if
    // the latter ended with an error. close() can be called if the node has been
    // prepared or the node is closed.
    // After calling close(), the caller calls open() again prior to subsequent calls to
    // get_next(). The default implementation updates runtime profile counters and calls
    // close() on the children. To ensure that close() is called on the entire plan
    // tree, each implementation should start out by calling the default implementation.
    virtual Status close(RuntimeState* state);
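    // For orientation, a single-threaded driver of an exec node tree follows the
    // lifecycle documented above roughly as in this sketch. It is illustrative
    // only ('root', 'state', and 'batch' are hypothetical locals; the real
    // driver lives in the fragment executor and differs in details such as
    // batch construction and cancellation checks):
    //
    //   RETURN_IF_ERROR(root->prepare(state));
    //   RETURN_IF_ERROR(root->open(state));
    //   bool eos = false;
    //   while (!eos) {
    //       RETURN_IF_ERROR(root->get_next(state, &batch, &eos));
    //       // ... consume 'batch', then reset it for reuse ...
    //   }
    //   RETURN_IF_ERROR(root->close(state));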
    // Creates an exec node tree from the list of nodes contained in plan via
    // depth-first traversal. All nodes are placed in pool.
    // Returns an error if 'plan' is corrupted, otherwise success.
    static Status create_tree(RuntimeState* state, ObjectPool* pool, const TPlan& plan,
                              const DescriptorTbl& descs, ExecNode** root);

    // Set the debug action for the node with the given id in 'tree'.
    static void set_debug_options(int node_id, TExecNodePhase::type phase,
                                  TDebugAction::type action, ExecNode* tree);

    // Collect all nodes of the given 'node_type' that are part of this subtree, and
    // return them in 'nodes'.
    void collect_nodes(TPlanNodeType::type node_type, std::vector<ExecNode*>* nodes);

    // Collect all scan nodes.
    void collect_scan_nodes(std::vector<ExecNode*>* nodes);

    // When the agg node is the scan node's direct parent,
    // we return the agg object from the scan node to the agg node directly,
    // without serializing the agg object.
    // This improvement is conservative: we ensure correctness first.
    void try_do_aggregate_serde_improve();

    typedef bool (*EvalConjunctsFn)(ExprContext* const* ctxs, int num_ctxs, TupleRow* row);

    // Evaluate the exprs over row. Returns true if all exprs return true.
    // TODO: This doesn't use the vector signature because I haven't figured
    // out how to deal with declaring a templated std::vector type in IR.
    static bool eval_conjuncts(ExprContext* const* ctxs, int num_ctxs, TupleRow* row);
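    // A typical use of eval_conjuncts() is a per-row filtering loop like the
    // sketch below. It is illustrative only; 'batch' is a hypothetical RowBatch
    // being filled by the node:
    //
    //   ExprContext* const* ctxs = &_conjunct_ctxs[0];
    //   int num_ctxs = _conjunct_ctxs.size();
    //   for (int i = 0; i < batch->num_rows(); ++i) {
    //       TupleRow* row = batch->get_row(i);
    //       if (!eval_conjuncts(ctxs, num_ctxs, row)) {
    //           // row fails at least one predicate and is filtered out
    //       }
    //   }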
    // Returns a string representation in DFS order of the plan rooted at this node.
    std::string debug_string() const;

    virtual void push_down_predicate(RuntimeState* state, std::list<ExprContext*>* expr_ctxs);

    // Recursive helper method for generating a string for debug_string().
    // Implementations should call debug_string(int, std::stringstream*) on their
    // children.
    // Input parameters:
    //   indentation_level: Current level in plan tree.
    // Output parameters:
    //   out: Stream to accumulate debug string.
    virtual void debug_string(int indentation_level, std::stringstream* out) const;

    const std::vector<ExprContext*>& conjunct_ctxs() const { return _conjunct_ctxs; }

    int id() const { return _id; }
    TPlanNodeType::type type() const { return _type; }
    const RowDescriptor& row_desc() const { return _row_descriptor; }
    int64_t rows_returned() const { return _num_rows_returned; }
    int64_t limit() const { return _limit; }
    bool reached_limit() { return _limit != -1 && _num_rows_returned >= _limit; }
    const std::vector<TupleId>& get_tuple_ids() const { return _tuple_ids; }

    RuntimeProfile* runtime_profile() { return _runtime_profile.get(); }
    RuntimeProfile::Counter* memory_used_counter() const { return _memory_used_counter; }

    MemTracker* mem_tracker() const { return _mem_tracker.get(); }
    MemTracker* expr_mem_tracker() const { return _expr_mem_tracker.get(); }
    MemPool* expr_mem_pool() { return _expr_mem_pool.get(); }

    // Extract the node id from p->name().
    static int get_node_id_from_profile(RuntimeProfile* p);

    // Names of counters shared by all exec nodes.
    static const std::string ROW_THROUGHPUT_COUNTER;

protected:
    friend class DataSink;

    /// Initialize '_buffer_pool_client' and claim the initial reservation for this
    /// ExecNode. Only needs to be called by ExecNodes that will use the client.
    /// The client is automatically cleaned up in close(). Should not be called if
    /// the client is already open.
    /// The ExecNode must return the initial reservation to
    /// QueryState::initial_reservations(), which is done automatically in close() as
    /// long as the initial reservation is not released before close().
    Status claim_buffer_reservation(RuntimeState* state);

    /// Release any unused reservation in excess of the node's initial reservation.
    /// Returns an error if releasing the reservation requires flushing pages to disk,
    /// and that fails.
    Status release_unused_reservation();

    /// Enable the increased reservation-denial probability on '_buffer_pool_client'
    /// based on the '_debug_action' set on this node. Returns an error if the debug
    /// action parameter is invalid.
    //Status enable_deny_reservation_debug_action();

    /// Extends the blocking queue for row batches. Row batches have the property that
    /// they must be processed in the order they were produced, even in cancellation
    /// paths. Preceding row batches can contain ptrs to memory in subsequent row
    /// batches, and we need to make sure those ptrs stay valid.
    /// Row batches that are added after Shutdown() are queued in another queue, which
    /// can be cleaned up during Close().
    /// All functions are thread safe.
    class RowBatchQueue : public BlockingQueue<RowBatch*> {
    public:
        /// 'max_batches' is the maximum number of row batches that can be queued.
        /// When the queue is full, producers will block.
        RowBatchQueue(int max_batches);
        ~RowBatchQueue();

        /// Adds a batch to the queue. This is blocking if the queue is full.
        void AddBatch(RowBatch* batch);

        /// Adds a batch to the queue. If the queue is full, this blocks until space
        /// becomes available or 'timeout_micros' has elapsed.
        /// Returns true if the element was added to the queue, false if it wasn't.
        /// If this method returns false, the queue didn't take ownership of the batch
        /// and it must be managed externally.
        bool AddBatchWithTimeout(RowBatch* batch, int64_t timeout_micros);

        /// Gets a row batch from the queue. Returns NULL if there are no more.
        /// This function blocks.
        /// Returns NULL after Shutdown().
        RowBatch* GetBatch();

        /// Deletes all row batches in cleanup_queue_. Not valid to call AddBatch()
        /// after this is called.
        /// Returns the number of io buffers that were released (for debug tracking).
        int Cleanup();

    private:
        /// Lock protecting cleanup_queue_
        // SpinLock lock_; // TODO(dhc): need to modify spinlock
        std::mutex lock_;

        /// Queue of orphaned row batches
        std::list<RowBatch*> cleanup_queue_;
    };
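    // Sketch of the intended producer/consumer pattern for RowBatchQueue
    // (illustrative only; 'queue', 'batch', and the surrounding threads are
    // hypothetical):
    //
    //   // producer thread:
    //   queue.AddBatch(batch);                    // blocks while the queue is full
    //
    //   // consumer thread:
    //   while (RowBatch* b = queue.GetBatch()) {  // NULL after Shutdown()
    //       // ... process 'b' ...
    //   }
    //   queue.Cleanup();                          // release leftover batches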
    int _id; // unique w/in single plan tree
    TPlanNodeType::type _type;
    ObjectPool* _pool;
    std::vector<Expr*> _conjuncts;
    std::vector<ExprContext*> _conjunct_ctxs;
    std::vector<TupleId> _tuple_ids;
    std::vector<ExecNode*> _children;
    RowDescriptor _row_descriptor;

    /// Resource information sent from the frontend.
    const TBackendResourceProfile _resource_profile;

    // debug-only: if _debug_action is not INVALID, node will perform action in
    // _debug_phase
    TExecNodePhase::type _debug_phase;
    TDebugAction::type _debug_action;

    int64_t _limit; // -1: no limit
    int64_t _num_rows_returned;

    boost::scoped_ptr<RuntimeProfile> _runtime_profile;

    /// Account for peak memory used by this node
    boost::scoped_ptr<MemTracker> _mem_tracker;

    /// MemTracker used by '_expr_mem_pool'.
    boost::scoped_ptr<MemTracker> _expr_mem_tracker;

    /// MemPool for allocating data structures used by expression evaluators in this
    /// node. Created in prepare().
    boost::scoped_ptr<MemPool> _expr_mem_pool;

    RuntimeProfile::Counter* _rows_returned_counter;
    RuntimeProfile::Counter* _rows_returned_rate;
    // Account for peak memory used by this node
    RuntimeProfile::Counter* _memory_used_counter;

    // Execution options that are determined at runtime. This is added to the
    // runtime profile at close(). An example of an option logged here would be
    // "Codegen Enabled".
    boost::mutex _exec_options_lock;
    std::string _runtime_exec_options;

    /// Buffer pool client for this node. Initialized with the node's minimum
    /// reservation in claim_buffer_reservation(). After initialization, the client
    /// must hold onto at least the minimum reservation so that it can be returned
    /// to the initial reservations pool in close().
    BufferPool::ClientHandle _buffer_pool_client;

    ExecNode* child(int i) { return _children[i]; }

    bool is_closed() const { return _is_closed; }

    // TODO(zc)
    /// Pointer to the containing SubplanNode or NULL if not inside a subplan.
    /// Set by SubplanNode::Init(). Not owned.
    // SubplanNode* containing_subplan_;

    /// Returns true if this node is inside the right-hand side plan tree of a
    /// SubplanNode. Valid to call in or after prepare().
    bool is_in_subplan() const { return false; }

    // Create a single exec node derived from the thrift node; place the exec node
    // in 'pool'.
    static Status create_node(RuntimeState* state, ObjectPool* pool, const TPlanNode& tnode,
                              const DescriptorTbl& descs, ExecNode** node);

    static Status create_tree_helper(RuntimeState* state, ObjectPool* pool,
                                     const std::vector<TPlanNode>& tnodes,
                                     const DescriptorTbl& descs, ExecNode* parent,
                                     int* node_idx, ExecNode** root);

    virtual bool is_scan_node() const { return false; }

    void init_runtime_profile(const std::string& name);

    // Executes _debug_action if phase matches _debug_phase.
    // 'phase' must not be INVALID.
    Status exec_debug_action(TExecNodePhase::type phase);

    // Appends option to '_runtime_exec_options'.
    void add_runtime_exec_option(const std::string& option);

    /// Frees any local allocations made by evals_to_free_ and returns the result of
    /// state->CheckQueryState(). Nodes should call this periodically, e.g. once per
    /// input row batch. This should not be called outside the main execution thread.
    //
    /// Nodes may override this to add extra periodic cleanup, e.g. freeing other local
    /// allocations. ExecNodes overriding this function should return
    /// ExecNode::QueryMaintenance().
    virtual Status QueryMaintenance(RuntimeState* state, const std::string& msg) WARN_UNUSED_RESULT;

private:
    bool _is_closed;
};

#define LIMIT_EXCEEDED(tracker, state, msg)                                                  \
    do {                                                                                     \
        stringstream str;                                                                    \
        str << "Memory exceeded limit. " << msg << " ";                                      \
        str << "Backend: " << BackendOptions::get_localhost() << ", ";                       \
        str << "fragment: " << print_id(state->fragment_instance_id()) << " ";               \
        str << "Used: " << tracker->consumption() << ", Limit: " << tracker->limit() << ". "; \
        str << "You can change the limit by the session variable exec_mem_limit.";           \
        return Status::MemoryLimitExceeded(str.str());                                       \
    } while (false)

#define RETURN_IF_LIMIT_EXCEEDED(state, msg)                                                  \
    do {                                                                                      \
        /* if (UNLIKELY(MemTracker::limit_exceeded(*(state)->mem_trackers()))) { */           \
        MemTracker* tracker = state->instance_mem_tracker()->find_limit_exceeded_tracker();   \
        if (tracker != nullptr) {                                                             \
            LIMIT_EXCEEDED(tracker, state, msg);                                              \
        }                                                                                     \
    } while (false)

} // namespace doris

#endif // DORIS_BE_SRC_QUERY_EXEC_EXEC_NODE_H