// doris/be/src/vec/exec/scan/scanner_context.h
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <bthread/types.h>
#include <stdint.h>

#include <atomic>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "common/factory_creator.h"
#include "common/status.h"
#include "concurrentqueue.h"
#include "util/lock.h"
#include "util/runtime_profile.h"
#include "vec/core/block.h"
#include "vec/exec/scan/vscanner.h"

namespace doris {
class ThreadPoolToken;
class RuntimeState;
class TupleDescriptor;
namespace pipeline {
class ScanLocalStateBase;
class ScannerDoneDependency;
class FinishDependency;
class DataReadyDependency;
} // namespace pipeline
namespace taskgroup {
class TaskGroup;
} // namespace taskgroup
namespace vectorized {
class VScanner;
class VScanNode;
class ScannerScheduler;
class SimplifiedScanScheduler;
// ScannerContext is responsible for recording the execution status
// of a group of Scanners corresponding to a ScanNode,
// including how many scanners are being scheduled, and for maintaining
// a producer-consumer block queue between the scanners and the scan node.
//
// ScannerContext is also the scheduling unit of ScannerScheduler.
// ScannerScheduler schedules a ScannerContext at a time,
// and submits the Scanners to the scanner thread pool for data scanning.
class ScannerContext {
ENABLE_FACTORY_CREATOR(ScannerContext);
public:
ScannerContext(RuntimeState* state_, VScanNode* parent,
const TupleDescriptor* output_tuple_desc,
const std::list<VScannerSPtr>& scanners_, int64_t limit_,
int64_t max_bytes_in_blocks_queue_, const int num_parallel_instances = 1,
pipeline::ScanLocalStateBase* local_state = nullptr);
virtual ~ScannerContext() = default;
virtual Status init();
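// Get a reusable block from the free-block pool; a new block is allocated
// when the pool is empty (tracked by _newly_create_free_blocks_num).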
vectorized::BlockUPtr get_free_block();
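// Return a consumed block to the free-block pool so its memory can be reused.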
void return_free_block(std::unique_ptr<vectorized::Block> block);
// Append blocks from scanners to the blocks queue.
virtual void append_blocks_to_queue(std::vector<vectorized::BlockUPtr>& blocks);
// Get the next block from the blocks queue. Called by the ScanNode.
// Sets eos to true if there is no more data to read;
// when eos is true, the returned block must be nullptr.
virtual Status get_block_from_queue(RuntimeState* state, vectorized::BlockUPtr* block,
bool* eos, int id, bool wait = true);
[[nodiscard]] Status validate_block_schema(Block* block);
// When a scanner completes a scan, this method is called
// to return the scanner to the list for the next scheduling round.
void push_back_scanner_and_reschedule(VScannerSPtr scanner);
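// Set _process_status to a non-ok status when an error occurs;
// returns true if this call actually changed the status (i.e. it reported the first error).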
bool set_status_on_error(const Status& status, bool need_lock = true);
Status status() {
if (_process_status.is<ErrorCode::END_OF_FILE>()) {
return Status::OK();
}
return _process_status;
}
virtual void set_dependency(
std::shared_ptr<pipeline::DataReadyDependency> dependency,
std::shared_ptr<pipeline::ScannerDoneDependency> scanner_done_dependency,
std::shared_ptr<pipeline::FinishDependency> finish_dependency) {}
// Called by ScanNode.
// Used to notify the scheduler that this ScannerContext can stop working.
void set_should_stop();
// Returns true if this ScannerContext needs no further processing.
virtual bool done() { return _is_finished || _should_stop; }
// Update the number of running scanners and scheduled contexts.
void update_num_running(int32_t scanner_inc, int32_t sched_inc);
int get_num_running_scanners() const { return _num_running_scanners; }
int get_num_scheduling_ctx() const { return _num_scheduling_ctx; }
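// Fill current_run with up to get_available_thread_slot_num() scanners
// popped from the _scanners list.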
void get_next_batch_of_scanners(std::list<VScannerSPtr>* current_run);
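// Called when the scan node closes: waits until all scheduled tasks of this
// ctx have finished, then closes and clears the remaining scanners.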
template <typename Parent>
void clear_and_join(Parent* parent, RuntimeState* state);
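// Returns true when no scanner of this ctx is running and the ctx itself
// is not in any scheduling queue.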
bool no_schedule();
virtual std::string debug_string();
RuntimeState* state() { return _state; }
void incr_num_ctx_scheduling(int64_t num) { _scanner_ctx_sched_counter->update(num); }
void incr_ctx_scheduling_time(int64_t num) { _scanner_ctx_sched_time->update(num); }
void incr_num_scanner_scheduling(int64_t num) { _scanner_sched_counter->update(num); }
std::string parent_name();
virtual bool empty_in_queue(int id);
// TODO(wb): rethink how to calculate _max_bytes_in_queue when executing a shared scan.
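// Heuristic: schedule more scanners only while the queue holds less than half of
// its byte budget and the number of blocks being served stays below allowed_blocks_num().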
inline bool should_be_scheduled() const {
return (_cur_bytes_in_queue < _max_bytes_in_queue / 2) &&
(_serving_blocks_num < allowed_blocks_num());
}
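// Number of scanner threads that may still be launched: the remaining block
// budget divided by blocks per scanner (rounded up), capped by the free
// thread slots (_max_thread_num - _num_running_scanners).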
int get_available_thread_slot_num() {
int thread_slot_num = (allowed_blocks_num() + _block_per_scanner - 1) / _block_per_scanner;
thread_slot_num = std::min(thread_slot_num, _max_thread_num - _num_running_scanners);
return thread_slot_num;
}
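// Upper bound on the number of blocks in flight: the free-block capacity,
// capped by the queue byte budget divided by the estimated bytes per block (rounded up).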
int32_t allowed_blocks_num() const {
int32_t blocks_num = std::min(_free_blocks_capacity,
int32_t((_max_bytes_in_queue + _estimated_block_bytes - 1) /
_estimated_block_bytes));
return blocks_num;
}
taskgroup::TaskGroup* get_task_group() const;
SimplifiedScanScheduler* get_simple_scan_scheduler() { return _simple_scan_scheduler; }
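// Re-submit this ctx to the scanner scheduler so that its remaining scanners
// can be scheduled again.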
void reschedule_scanner_ctx();
// the unique id of this context
std::string ctx_id;
int32_t queue_idx = -1;
ThreadPoolToken* thread_token = nullptr;
std::vector<bthread_t> _btids;
bool _should_reset_thread_name = true;
private:
template <typename Parent>
Status _close_and_clear_scanners(Parent* parent, RuntimeState* state);
protected:
virtual void _dispose_coloate_blocks_not_in_queue() {}
RuntimeState* _state;
VScanNode* _parent;
pipeline::ScanLocalStateBase* _local_state;
// See the comments on the same fields in VScanNode.
const TupleDescriptor* _output_tuple_desc;
// _transfer_lock is used to protect the critical section
// where the ScanNode and ScannerScheduler interact,
// including access to variables such as _blocks_queue, _process_status and _is_finished.
doris::Mutex _transfer_lock;
// Blocks produced by scanners are appended to _blocks_queue,
// and the upper scan node consumes blocks from this queue.
// Should be protected by _transfer_lock.
std::list<vectorized::BlockUPtr> _blocks_queue;
// Wait in get_block_from_queue(), by ScanNode.
doris::ConditionVariable _blocks_queue_added_cv;
// Wait in clear_and_join(), by ScanNode.
doris::ConditionVariable _ctx_finish_cv;
// The following 3 variables control the process of the scanner scheduling.
// Use _transfer_lock to protect them.
// 1. _process_status
//    Indicates the global status of this scanner context.
//    Set to a non-ok status when an error is encountered;
//    once it is non-ok, the scanner process should stop.
//    Can be set by either the ScanNode or the ScannerScheduler.
// 2. _should_stop
//    Always set by the ScanNode.
//    True means no more data needs to be read (limit reached or node closed).
// 3. _is_finished
//    Always set by the ScannerScheduler.
//    True means all scanners have finished scanning.
Status _process_status;
std::atomic_bool _status_error = false;
std::atomic_bool _should_stop = false;
std::atomic_bool _is_finished = false;
// Lazy-allocated blocks for all scanners to share, for memory reuse.
moodycamel::ConcurrentQueue<vectorized::BlockUPtr> _free_blocks;
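// Number of blocks currently being served to running scanners;
// should_be_scheduled() compares it against allowed_blocks_num().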
std::atomic<int32_t> _serving_blocks_num = 0;
// The current number of free blocks available to the scanners.
// Used to limit the memory usage of the scanner.
// NOTE: this is NOT the size of `_free_blocks`.
int32_t _free_blocks_capacity = 0;
int64_t _estimated_block_bytes = 0;
int _batch_size;
// The limit from SQL's limit clause
int64_t limit;
// Current number of running scanners.
std::atomic_int32_t _num_running_scanners = 0;
// Current number of ctx entries being scheduled.
// After each scanner finishes a task, it puts the corresponding ctx
// back into the scheduling queue, so the queue may hold multiple
// pointers to the same ctx.
// We record the number of such entries so they can be cleaned up at the end.
std::atomic_int32_t _num_scheduling_ctx = 0;
// Num of unfinished scanners. Should be set in init()
std::atomic_int32_t _num_unfinished_scanners = 0;
// Max number of scan thread for this scanner context.
int32_t _max_thread_num = 0;
// How many blocks a scanner can use in one task.
int32_t _block_per_scanner = 0;
// The current bytes of blocks in blocks queue
int64_t _cur_bytes_in_queue = 0;
// The max limit bytes of blocks in blocks queue
const int64_t _max_bytes_in_queue;
doris::vectorized::ScannerScheduler* _scanner_scheduler;
SimplifiedScanScheduler* _simple_scan_scheduler = nullptr; // used for cpu hard limit
// List "scanners" saves all "unfinished" scanners.
// The scanner scheduler will pop scanners from this list, run scanner,
// and then if the scanner is not finished, will be pushed back to this list.
// Not need to protect by lock, because only one scheduler thread will access to it.
doris::Mutex _scanners_lock;
std::list<VScannerSPtr> _scanners;
std::vector<int64_t> _finished_scanner_runtime;
std::vector<int64_t> _finished_scanner_rows_read;
std::vector<int64_t> _finished_scanner_wait_worker_time;
const int _num_parallel_instances;
std::shared_ptr<RuntimeProfile> _scanner_profile;
RuntimeProfile::Counter* _scanner_sched_counter = nullptr;
RuntimeProfile::Counter* _scanner_ctx_sched_counter = nullptr;
RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr;
RuntimeProfile::HighWaterMarkCounter* _free_blocks_memory_usage = nullptr;
RuntimeProfile::HighWaterMarkCounter* _queued_blocks_memory_usage = nullptr;
RuntimeProfile::Counter* _newly_create_free_blocks_num = nullptr;
RuntimeProfile::Counter* _scanner_wait_batch_timer = nullptr;
std::shared_ptr<pipeline::ScannerDoneDependency> _scanner_done_dependency = nullptr;
std::shared_ptr<pipeline::FinishDependency> _finish_dependency = nullptr;
};
} // namespace vectorized
} // namespace doris