// doris/be/src/vec/exec/scan/scanner_context.h
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <bthread/types.h>
#include <stdint.h>

#include <atomic>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <vector>

#include "common/factory_creator.h"
#include "common/status.h"
#include "concurrentqueue.h"
#include "util/lock.h"
#include "util/runtime_profile.h"
#include "vec/core/block.h"
#include "vec/exec/scan/vscanner.h"

namespace doris {
class ThreadPoolToken;
class RuntimeState;
class TupleDescriptor;
namespace pipeline {
class ScanLocalStateBase;
class ScannerDoneDependency;
class FinishDependency;
class DataReadyDependency;
} // namespace pipeline
namespace taskgroup {
class TaskGroup;
} // namespace taskgroup
namespace vectorized {
class VScanner;
class VScanNode;
class ScannerScheduler;
class SimplifiedScanScheduler;
// ScannerContext is responsible for recording the execution status
// of a group of Scanners corresponding to a ScanNode,
// including how many scanners are being scheduled, and for maintaining
// a producer-consumer block queue between the scanners and the scan node.
//
// ScannerContext is also the scheduling unit of ScannerScheduler.
// ScannerScheduler schedules a ScannerContext at a time,
// and submits the Scanners to the scanner thread pool for data scanning.
class ScannerContext {
ENABLE_FACTORY_CREATOR(ScannerContext);
public:
ScannerContext(RuntimeState* state_, VScanNode* parent,
const TupleDescriptor* output_tuple_desc,
const std::list<VScannerSPtr>& scanners_, int64_t limit_,
int64_t max_bytes_in_blocks_queue_, const int num_parallel_instances = 1,
pipeline::ScanLocalStateBase* local_state = nullptr);
virtual ~ScannerContext() = default;
virtual Status init();
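// Get a reusable block from the free-block pool; a new block is allocated
// when the pool is empty (tracked by _newly_create_free_blocks_num).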
vectorized::BlockUPtr get_free_block();
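// Return a consumed block to the free-block pool so its memory can be reused.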
void return_free_block(std::unique_ptr<vectorized::Block> block);
// Append blocks from scanners to the blocks queue.
virtual void append_blocks_to_queue(std::vector<vectorized::BlockUPtr>& blocks);
// Get the next block from the blocks queue. Called by the ScanNode.
// Sets eos to true if there is no more data to read;
// when eos is true, the returned block must be nullptr.
virtual Status get_block_from_queue(RuntimeState* state, vectorized::BlockUPtr* block,
bool* eos, int id, bool wait = true);
[[nodiscard]] Status validate_block_schema(Block* block);
// When a scanner completes a scan, this method is called
// to return the scanner to the list for the next scheduling round.
void push_back_scanner_and_reschedule(VScannerSPtr scanner);
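// Set _process_status to a non-ok status when an error occurs;
// returns true if this call actually changed the status (i.e. it reported the first error).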
bool set_status_on_error(const Status& status, bool need_lock = true);
Status status() {
if (_process_status.is<ErrorCode::END_OF_FILE>()) {
return Status::OK();
}
return _process_status;
}
virtual void set_dependency(
std::shared_ptr<pipeline::DataReadyDependency> dependency,
std::shared_ptr<pipeline::ScannerDoneDependency> scanner_done_dependency,
std::shared_ptr<pipeline::FinishDependency> finish_dependency) {}
// Called by ScanNode.
// Used to notify the scheduler that this ScannerContext can stop working.
void set_should_stop();
// Returns true if this ScannerContext needs no further processing.
virtual bool done() { return _is_finished || _should_stop; }
// Update the number of running scanners and scheduled contexts.
void update_num_running(int32_t scanner_inc, int32_t sched_inc);
int get_num_running_scanners() const { return _num_running_scanners; }
int get_num_scheduling_ctx() const { return _num_scheduling_ctx; }
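// Fill current_run with up to get_available_thread_slot_num() scanners
// popped from the _scanners list.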
void get_next_batch_of_scanners(std::list<VScannerSPtr>* current_run);
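// Called when the scan node closes: waits until all scheduled tasks of this
// ctx have finished, then closes and clears the remaining scanners.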
template <typename Parent>
void clear_and_join(Parent* parent, RuntimeState* state);
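// Returns true when no scanner of this ctx is running and the ctx itself
// is not in any scheduling queue.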
bool no_schedule();
virtual std::string debug_string();
RuntimeState* state() { return _state; }
void incr_num_ctx_scheduling(int64_t num) { _scanner_ctx_sched_counter->update(num); }
void incr_ctx_scheduling_time(int64_t num) { _scanner_ctx_sched_time->update(num); }
void incr_num_scanner_scheduling(int64_t num) { _scanner_sched_counter->update(num); }
std::string parent_name();
virtual bool empty_in_queue(int id);
// TODO(wb): rethink how to calculate _max_bytes_in_queue when executing a shared scan.
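// Heuristic: schedule more scanners only while the queue holds less than half of
// its byte budget and the number of blocks being served stays below allowed_blocks_num().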
inline bool should_be_scheduled() const {
return (_cur_bytes_in_queue < _max_bytes_in_queue / 2) &&
(_serving_blocks_num < allowed_blocks_num());
}
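// Number of scanner threads that may still be launched: the remaining block
// budget divided by blocks per scanner (rounded up), capped by the free
// thread slots (_max_thread_num - _num_running_scanners).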
int get_available_thread_slot_num() {
int thread_slot_num = (allowed_blocks_num() + _block_per_scanner - 1) / _block_per_scanner;
thread_slot_num = std::min(thread_slot_num, _max_thread_num - _num_running_scanners);
return thread_slot_num;
}
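// Upper bound on the number of blocks in flight: the free-block capacity,
// capped by the queue byte budget divided by the estimated bytes per block (rounded up).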
int32_t allowed_blocks_num() const {
int32_t blocks_num = std::min(_free_blocks_capacity,
int32_t((_max_bytes_in_queue + _estimated_block_bytes - 1) /
_estimated_block_bytes));
return blocks_num;
}
taskgroup::TaskGroup* get_task_group() const;
SimplifiedScanScheduler* get_simple_scan_scheduler() { return _simple_scan_scheduler; }
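// Re-submit this ctx to the scanner scheduler so that its remaining scanners
// can be scheduled again.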
void reschedule_scanner_ctx();
// the unique id of this context
std::string ctx_id;
int32_t queue_idx = -1;
ThreadPoolToken* thread_token = nullptr;
std::vector<bthread_t> _btids;
bool _should_reset_thread_name = true;
private:
template <typename Parent>
Status _close_and_clear_scanners(Parent* parent, RuntimeState* state);
protected:
virtual void _dispose_coloate_blocks_not_in_queue() {}
RuntimeState* _state;
VScanNode* _parent;
pipeline::ScanLocalStateBase* _local_state;
// See the comments on the same fields in VScanNode.
const TupleDescriptor* _output_tuple_desc;
// _transfer_lock is used to protect the critical section
// where the ScanNode and ScannerScheduler interact,
// including access to variables such as _blocks_queue, _process_status and _is_finished.
doris::Mutex _transfer_lock;
// Blocks produced by scanners are appended to _blocks_queue,
// and the upper scan node consumes blocks from this queue.
// Should be protected by _transfer_lock.
std::list<vectorized::BlockUPtr> _blocks_queue;
// Wait in get_block_from_queue(), by ScanNode.
doris::ConditionVariable _blocks_queue_added_cv;
// Wait in clear_and_join(), by ScanNode.
doris::ConditionVariable _ctx_finish_cv;
// The following 3 variables control the process of the scanner scheduling.
// Use _transfer_lock to protect them.
// 1. _process_status
//    Indicates the global status of this scanner context.
//    Set to a non-ok status when an error is encountered;
//    once it is non-ok, the scanner process should stop.
//    Can be set by either the ScanNode or the ScannerScheduler.
// 2. _should_stop
//    Always set by the ScanNode.
//    True means no more data needs to be read (limit reached or node closed).
// 3. _is_finished
//    Always set by the ScannerScheduler.
//    True means all scanners have finished scanning.
Status _process_status;
std::atomic_bool _status_error = false;
std::atomic_bool _should_stop = false;
std::atomic_bool _is_finished = false;
// Lazy-allocated blocks for all scanners to share, for memory reuse.
moodycamel::ConcurrentQueue<vectorized::BlockUPtr> _free_blocks;
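// Number of blocks currently being served to running scanners;
// should_be_scheduled() compares it against allowed_blocks_num().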
std::atomic<int32_t> _serving_blocks_num = 0;
// The current number of free blocks available to the scanners.
// Used to limit the memory usage of the scanner.
// NOTE: this is NOT the size of `_free_blocks`.
int32_t _free_blocks_capacity = 0;
int64_t _estimated_block_bytes = 0;
int _batch_size;
// The limit from SQL's limit clause
int64_t limit;
// Current number of running scanners.
std::atomic_int32_t _num_running_scanners = 0;
// Current number of ctx entries being scheduled.
// After each scanner finishes a task, it puts the corresponding ctx
// back into the scheduling queue, so the queue may hold multiple
// pointers to the same ctx.
// We record the number of such entries so they can be cleaned up at the end.
std::atomic_int32_t _num_scheduling_ctx = 0;
// Num of unfinished scanners. Should be set in init()
std::atomic_int32_t _num_unfinished_scanners = 0;
// Max number of scan thread for this scanner context.
int32_t _max_thread_num = 0;
// How many blocks a scanner can use in one task.
int32_t _block_per_scanner = 0;
// The current bytes of blocks in blocks queue
int64_t _cur_bytes_in_queue = 0;
// The max limit bytes of blocks in blocks queue
const int64_t _max_bytes_in_queue;
doris::vectorized::ScannerScheduler* _scanner_scheduler;
SimplifiedScanScheduler* _simple_scan_scheduler = nullptr; // used for cpu hard limit
// List "scanners" saves all "unfinished" scanners.
// The scanner scheduler will pop scanners from this list, run scanner,
// and then if the scanner is not finished, will be pushed back to this list.
// Not need to protect by lock, because only one scheduler thread will access to it.
doris::Mutex _scanners_lock;
std::list<VScannerSPtr> _scanners;
std::vector<int64_t> _finished_scanner_runtime;
std::vector<int64_t> _finished_scanner_rows_read;
std::vector<int64_t> _finished_scanner_wait_worker_time;
const int _num_parallel_instances;
std::shared_ptr<RuntimeProfile> _scanner_profile;
RuntimeProfile::Counter* _scanner_sched_counter = nullptr;
RuntimeProfile::Counter* _scanner_ctx_sched_counter = nullptr;
RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr;
RuntimeProfile::HighWaterMarkCounter* _free_blocks_memory_usage = nullptr;
RuntimeProfile::HighWaterMarkCounter* _queued_blocks_memory_usage = nullptr;
RuntimeProfile::Counter* _newly_create_free_blocks_num = nullptr;
RuntimeProfile::Counter* _scanner_wait_batch_timer = nullptr;
std::shared_ptr<pipeline::ScannerDoneDependency> _scanner_done_dependency = nullptr;
std::shared_ptr<pipeline::FinishDependency> _finish_dependency = nullptr;
};
} // namespace vectorized
} // namespace doris