[refactor](scanner) refactoring and optimizing scanner scheduling (#30746)
This commit is contained in:
@ -38,8 +38,10 @@ Status FileScanLocalState::_init_scanners(std::list<vectorized::VScannerSPtr>* s
|
||||
}
|
||||
|
||||
auto& p = _parent->cast<FileScanOperatorX>();
|
||||
size_t shard_num =
|
||||
std::min<size_t>(config::doris_scanner_thread_pool_thread_num, _scan_ranges.size());
|
||||
size_t shard_num = std::min<size_t>(
|
||||
config::doris_scanner_thread_pool_thread_num / state()->query_parallel_instance_num(),
|
||||
_scan_ranges.size());
|
||||
shard_num = std::max(shard_num, (size_t)1);
|
||||
_kv_cache.reset(new vectorized::ShardedKVCache(shard_num));
|
||||
for (auto& scan_range : _scan_ranges) {
|
||||
std::unique_ptr<vectorized::VFileScanner> scanner = vectorized::VFileScanner::create_unique(
|
||||
@ -62,7 +64,7 @@ void FileScanLocalState::set_scan_ranges(RuntimeState* state,
|
||||
const std::vector<TScanRangeParams>& scan_ranges) {
|
||||
int max_scanners =
|
||||
config::doris_scanner_thread_pool_thread_num / state->query_parallel_instance_num();
|
||||
max_scanners = max_scanners == 0 ? 1 : max_scanners;
|
||||
max_scanners = std::max(std::max(max_scanners, state->parallel_scan_max_scanners_count()), 1);
|
||||
// For select * from table limit 10; should just use one thread.
|
||||
if (should_run_serial()) {
|
||||
max_scanners = 1;
|
||||
|
||||
@ -160,7 +160,6 @@ Status ScanLocalState<Derived>::open(RuntimeState* state) {
|
||||
if (_scanner_ctx) {
|
||||
DCHECK(!_eos && _num_scanners->value() > 0);
|
||||
RETURN_IF_ERROR(_scanner_ctx->init());
|
||||
RETURN_IF_ERROR(state->exec_env()->scanner_scheduler()->submit(_scanner_ctx));
|
||||
}
|
||||
_opened = true;
|
||||
return status;
|
||||
@ -1288,16 +1287,14 @@ Status ScanLocalState<Derived>::_init_profile() {
|
||||
profile()->add_child(_scanner_profile.get(), true, nullptr);
|
||||
|
||||
_memory_usage_counter = ADD_LABEL_COUNTER_WITH_LEVEL(_scanner_profile, "MemoryUsage", 1);
|
||||
_queued_blocks_memory_usage = _scanner_profile->AddHighWaterMarkCounter(
|
||||
"QueuedBlocks", TUnit::BYTES, "MemoryUsage", 1);
|
||||
_free_blocks_memory_usage =
|
||||
_scanner_profile->AddHighWaterMarkCounter("FreeBlocks", TUnit::BYTES, "MemoryUsage", 1);
|
||||
_newly_create_free_blocks_num =
|
||||
ADD_COUNTER(_scanner_profile, "NewlyCreateFreeBlocksNum", TUnit::UNIT);
|
||||
_scale_up_scanners_counter = ADD_COUNTER(_scanner_profile, "NumScaleUpScanners", TUnit::UNIT);
|
||||
// time of transfer thread to wait for block from scan thread
|
||||
_scanner_wait_batch_timer = ADD_TIMER(_scanner_profile, "ScannerBatchWaitTime");
|
||||
_scanner_sched_counter = ADD_COUNTER(_scanner_profile, "ScannerSchedCount", TUnit::UNIT);
|
||||
_scanner_ctx_sched_counter = ADD_COUNTER(_scanner_profile, "ScannerCtxSchedCount", TUnit::UNIT);
|
||||
_scanner_ctx_sched_time = ADD_TIMER(_scanner_profile, "ScannerCtxSchedTime");
|
||||
|
||||
_scan_timer = ADD_TIMER(_scanner_profile, "ScannerGetBlockTime");
|
||||
@ -1456,14 +1453,10 @@ Status ScanOperatorX<LocalStateType>::get_block(RuntimeState* state, vectorized:
|
||||
}};
|
||||
|
||||
if (state->is_cancelled()) {
|
||||
// ISSUE: https://github.com/apache/doris/issues/16360
|
||||
// _scanner_ctx may be null here, see: `VScanNode::alloc_resource` (_eos == null)
|
||||
if (local_state._scanner_ctx) {
|
||||
local_state._scanner_ctx->set_status_on_error(Status::Cancelled("query cancelled"));
|
||||
return local_state._scanner_ctx->status();
|
||||
} else {
|
||||
return Status::Cancelled("query cancelled");
|
||||
local_state._scanner_ctx->stop_scanners(state);
|
||||
}
|
||||
return Status::Cancelled("Query cancelled in ScanOperator");
|
||||
}
|
||||
|
||||
if (local_state._eos) {
|
||||
@ -1471,21 +1464,11 @@ Status ScanOperatorX<LocalStateType>::get_block(RuntimeState* state, vectorized:
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
vectorized::BlockUPtr scan_block = nullptr;
|
||||
bool eos = false;
|
||||
RETURN_IF_ERROR(local_state._scanner_ctx->get_block_from_queue(state, &scan_block, &eos, 0));
|
||||
if (eos) {
|
||||
source_state = SourceState::FINISHED;
|
||||
DCHECK(scan_block == nullptr);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// get scanner's block memory
|
||||
block->swap(*scan_block);
|
||||
local_state._scanner_ctx->return_free_block(std::move(scan_block));
|
||||
RETURN_IF_ERROR(local_state._scanner_ctx->get_block_from_queue(state, block, &eos, 0));
|
||||
|
||||
local_state.reached_limit(block, source_state);
|
||||
if (eos) {
|
||||
if (eos || source_state == SourceState::FINISHED) {
|
||||
source_state = SourceState::FINISHED;
|
||||
// reach limit, stop the scanners.
|
||||
local_state._scanner_ctx->stop_scanners(state);
|
||||
|
||||
@ -142,7 +142,6 @@ protected:
|
||||
|
||||
std::shared_ptr<RuntimeProfile> _scanner_profile;
|
||||
RuntimeProfile::Counter* _scanner_sched_counter = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_ctx_sched_counter = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_wait_batch_timer = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_wait_worker_timer = nullptr;
|
||||
@ -160,8 +159,8 @@ protected:
|
||||
// time of filter output block from scanner
|
||||
RuntimeProfile::Counter* _filter_timer = nullptr;
|
||||
RuntimeProfile::Counter* _memory_usage_counter = nullptr;
|
||||
RuntimeProfile::HighWaterMarkCounter* _queued_blocks_memory_usage = nullptr;
|
||||
RuntimeProfile::HighWaterMarkCounter* _free_blocks_memory_usage = nullptr;
|
||||
RuntimeProfile::Counter* _scale_up_scanners_counter = nullptr;
|
||||
// rows read from the scanner (including those discarded by (pre)filters)
|
||||
RuntimeProfile::Counter* _rows_read_counter = nullptr;
|
||||
|
||||
|
||||
@ -128,7 +128,13 @@ public:
|
||||
: _query_options.query_timeout;
|
||||
}
|
||||
int max_io_buffers() const { return _query_options.max_io_buffers; }
|
||||
int num_scanner_threads() const { return _query_options.num_scanner_threads; }
|
||||
int num_scanner_threads() const {
|
||||
return _query_options.__isset.num_scanner_threads ? _query_options.num_scanner_threads : 0;
|
||||
}
|
||||
double scanner_scale_up_ratio() const {
|
||||
return _query_options.__isset.scanner_scale_up_ratio ? _query_options.scanner_scale_up_ratio
|
||||
: 0;
|
||||
}
|
||||
TQueryType::type query_type() const { return _query_options.query_type; }
|
||||
int64_t timestamp_ms() const { return _timestamp_ms; }
|
||||
int32_t nano_seconds() const { return _nano_seconds; }
|
||||
|
||||
@ -62,7 +62,7 @@ void NewFileScanNode::set_scan_ranges(RuntimeState* state,
|
||||
const std::vector<TScanRangeParams>& scan_ranges) {
|
||||
int max_scanners =
|
||||
config::doris_scanner_thread_pool_thread_num / state->query_parallel_instance_num();
|
||||
max_scanners = max_scanners == 0 ? 1 : max_scanners;
|
||||
max_scanners = std::max(std::max(max_scanners, state->parallel_scan_max_scanners_count()), 1);
|
||||
// For select * from table limit 10; should just use one thread.
|
||||
if (should_run_serial()) {
|
||||
max_scanners = 1;
|
||||
@ -116,9 +116,10 @@ Status NewFileScanNode::_init_scanners(std::list<VScannerSPtr>* scanners) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// TODO: determine kv cache shard num
|
||||
size_t shard_num =
|
||||
std::min<size_t>(config::doris_scanner_thread_pool_thread_num, _scan_ranges.size());
|
||||
size_t shard_num = std::min<size_t>(
|
||||
config::doris_scanner_thread_pool_thread_num / _state->query_parallel_instance_num(),
|
||||
_scan_ranges.size());
|
||||
shard_num = std::max(shard_num, (size_t)1);
|
||||
_kv_cache.reset(new ShardedKVCache(shard_num));
|
||||
for (auto& scan_range : _scan_ranges) {
|
||||
std::unique_ptr<VFileScanner> scanner =
|
||||
|
||||
@ -36,129 +36,6 @@ public:
|
||||
: vectorized::ScannerContext(state, parent, output_tuple_desc, output_row_descriptor,
|
||||
scanners, limit_, max_bytes_in_blocks_queue,
|
||||
num_parallel_instances) {}
|
||||
|
||||
Status get_block_from_queue(RuntimeState* state, vectorized::BlockUPtr* block, bool* eos,
|
||||
int id) override {
|
||||
{
|
||||
std::unique_lock l(_transfer_lock);
|
||||
if (state->is_cancelled()) {
|
||||
set_status_on_error(Status::Cancelled("cancelled"), false);
|
||||
}
|
||||
|
||||
if (!status().ok()) {
|
||||
return _process_status;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<vectorized::BlockUPtr> merge_blocks;
|
||||
{
|
||||
std::unique_lock<std::mutex> l(*_queue_mutexs[id]);
|
||||
// The pipeline maybe wake up by scanner.done. If there are still any data
|
||||
// in the queue, should read the data first and then check if the scanner.done
|
||||
// if done, then eos is returned to indicate that the scan operator finished.
|
||||
if (_blocks_queues[id].empty()) {
|
||||
*eos = done();
|
||||
return Status::OK();
|
||||
}
|
||||
if (_process_status.is<ErrorCode::CANCELLED>()) {
|
||||
*eos = true;
|
||||
return Status::OK();
|
||||
}
|
||||
*block = std::move(_blocks_queues[id].front());
|
||||
_blocks_queues[id].pop_front();
|
||||
|
||||
auto rows = (*block)->rows();
|
||||
while (!_blocks_queues[id].empty()) {
|
||||
const auto add_rows = (*_blocks_queues[id].front()).rows();
|
||||
if (rows + add_rows < state->batch_size()) {
|
||||
rows += add_rows;
|
||||
merge_blocks.emplace_back(std::move(_blocks_queues[id].front()));
|
||||
_blocks_queues[id].pop_front();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (_blocks_queues[id].empty()) {
|
||||
this->reschedule_scanner_ctx();
|
||||
}
|
||||
}
|
||||
|
||||
_current_used_bytes -= (*block)->allocated_bytes();
|
||||
if (!merge_blocks.empty()) {
|
||||
vectorized::MutableBlock m(block->get());
|
||||
for (auto& merge_block : merge_blocks) {
|
||||
_current_used_bytes -= merge_block->allocated_bytes();
|
||||
static_cast<void>(m.merge(*merge_block));
|
||||
return_free_block(std::move(merge_block));
|
||||
}
|
||||
(*block)->set_columns(std::move(m.mutable_columns()));
|
||||
}
|
||||
|
||||
// after return free blocks, should try to reschedule the scanner
|
||||
if (should_be_scheduled()) {
|
||||
this->reschedule_scanner_ctx();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void append_blocks_to_queue(std::vector<vectorized::BlockUPtr>& blocks) override {
|
||||
const int queue_size = _blocks_queues.size();
|
||||
const int block_size = blocks.size();
|
||||
if (block_size == 0) {
|
||||
return;
|
||||
}
|
||||
int64_t local_bytes = 0;
|
||||
|
||||
for (const auto& block : blocks) {
|
||||
auto st = validate_block_schema(block.get());
|
||||
if (!st.ok()) {
|
||||
set_status_on_error(st, false);
|
||||
}
|
||||
local_bytes += block->allocated_bytes();
|
||||
}
|
||||
|
||||
for (int i = 0; i < queue_size && i < block_size; ++i) {
|
||||
int queue = _next_queue_to_feed;
|
||||
{
|
||||
std::lock_guard<std::mutex> l(*_queue_mutexs[queue]);
|
||||
for (int j = i; j < block_size; j += queue_size) {
|
||||
_blocks_queues[queue].emplace_back(std::move(blocks[j]));
|
||||
}
|
||||
}
|
||||
_next_queue_to_feed = queue + 1 < queue_size ? queue + 1 : 0;
|
||||
}
|
||||
_current_used_bytes += local_bytes;
|
||||
}
|
||||
|
||||
bool empty_in_queue(int id) override {
|
||||
std::unique_lock<std::mutex> l(*_queue_mutexs[id]);
|
||||
return _blocks_queues[id].empty();
|
||||
}
|
||||
|
||||
Status init() override {
|
||||
for (int i = 0; i < _num_parallel_instances; ++i) {
|
||||
_queue_mutexs.emplace_back(std::make_unique<std::mutex>());
|
||||
_blocks_queues.emplace_back(std::list<vectorized::BlockUPtr>());
|
||||
}
|
||||
return ScannerContext::init();
|
||||
}
|
||||
|
||||
std::string debug_string() override {
|
||||
auto res = ScannerContext::debug_string();
|
||||
for (int i = 0; i < _blocks_queues.size(); ++i) {
|
||||
res += " queue " + std::to_string(i) + ":size " +
|
||||
std::to_string(_blocks_queues[i].size());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected:
|
||||
int _next_queue_to_feed = 0;
|
||||
std::vector<std::unique_ptr<std::mutex>> _queue_mutexs;
|
||||
std::vector<std::list<vectorized::BlockUPtr>> _blocks_queues;
|
||||
std::atomic_int64_t _current_used_bytes = 0;
|
||||
};
|
||||
|
||||
class PipXScannerContext final : public vectorized::ScannerContext {
|
||||
@ -172,117 +49,38 @@ public:
|
||||
int64_t limit_, int64_t max_bytes_in_blocks_queue,
|
||||
std::shared_ptr<pipeline::ScanDependency> dependency)
|
||||
: vectorized::ScannerContext(state, output_tuple_desc, output_row_descriptor, scanners,
|
||||
limit_, max_bytes_in_blocks_queue, 1, local_state,
|
||||
dependency) {}
|
||||
Status get_block_from_queue(RuntimeState* state, vectorized::BlockUPtr* block, bool* eos,
|
||||
int id) override {
|
||||
if (_blocks_queue_buffered.empty()) {
|
||||
std::unique_lock l(_transfer_lock);
|
||||
if (state->is_cancelled()) {
|
||||
set_status_on_error(Status::Cancelled("cancelled"), false);
|
||||
}
|
||||
|
||||
if (!status().ok()) {
|
||||
return _process_status;
|
||||
}
|
||||
|
||||
if (_blocks_queue.empty()) {
|
||||
*eos = done();
|
||||
return Status::OK();
|
||||
}
|
||||
if (_process_status.is<ErrorCode::CANCELLED>()) {
|
||||
*eos = true;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
_blocks_queue_buffered = std::move(_blocks_queue);
|
||||
}
|
||||
|
||||
// `get_block_from_queue` should not be called concurrently from multiple threads,
|
||||
// so here no need to lock.
|
||||
*block = std::move(_blocks_queue_buffered.front());
|
||||
_blocks_queue_buffered.pop_front();
|
||||
|
||||
std::vector<vectorized::BlockUPtr> merge_blocks;
|
||||
auto rows = (*block)->rows();
|
||||
while (!_blocks_queue_buffered.empty()) {
|
||||
const auto add_rows = (*_blocks_queue_buffered.front()).rows();
|
||||
if (rows + add_rows < state->batch_size()) {
|
||||
rows += add_rows;
|
||||
merge_blocks.emplace_back(std::move(_blocks_queue_buffered.front()));
|
||||
_blocks_queue_buffered.pop_front();
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (_blocks_queue_buffered.empty()) {
|
||||
std::unique_lock l(_transfer_lock);
|
||||
if (_blocks_queue.empty()) {
|
||||
this->reschedule_scanner_ctx();
|
||||
_dependency->block();
|
||||
} else {
|
||||
_blocks_queue_buffered = std::move(_blocks_queue);
|
||||
}
|
||||
}
|
||||
|
||||
_cur_bytes_in_queue -= (*block)->allocated_bytes();
|
||||
if (!merge_blocks.empty()) {
|
||||
vectorized::MutableBlock m(block->get());
|
||||
for (auto& merge_block : merge_blocks) {
|
||||
_cur_bytes_in_queue -= merge_block->allocated_bytes();
|
||||
static_cast<void>(m.merge(*merge_block));
|
||||
if (merge_block->mem_reuse()) {
|
||||
_free_blocks_buffered.emplace_back(std::move(merge_block));
|
||||
}
|
||||
}
|
||||
(*block)->set_columns(std::move(m.mutable_columns()));
|
||||
}
|
||||
return_free_blocks();
|
||||
|
||||
// after return free blocks, should try to reschedule the scanner
|
||||
if (should_be_scheduled()) {
|
||||
this->reschedule_scanner_ctx();
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
limit_, max_bytes_in_blocks_queue, 1, local_state) {
|
||||
_dependency = dependency;
|
||||
}
|
||||
|
||||
void reschedule_scanner_ctx() override {
|
||||
if (done()) {
|
||||
return;
|
||||
void append_block_to_queue(std::shared_ptr<vectorized::ScanTask> scan_task) override {
|
||||
vectorized::ScannerContext::append_block_to_queue(scan_task);
|
||||
if (_dependency) {
|
||||
_dependency->set_ready();
|
||||
}
|
||||
auto state = _scanner_scheduler->submit(shared_from_this());
|
||||
//todo(wb) rethinking is it better to mark current scan_context failed when submit failed many times?
|
||||
if (state.ok()) {
|
||||
_num_scheduling_ctx++;
|
||||
} else {
|
||||
set_status_on_error(state, false);
|
||||
}
|
||||
|
||||
Status get_block_from_queue(RuntimeState* state, vectorized::Block* block, bool* eos, int id,
|
||||
bool wait = true) override {
|
||||
Status st = vectorized::ScannerContext::get_block_from_queue(state, block, eos, id, wait);
|
||||
std::lock_guard<std::mutex> l(_transfer_lock);
|
||||
if (_blocks_queue.empty()) {
|
||||
if (_dependency) {
|
||||
_dependency->block();
|
||||
}
|
||||
}
|
||||
return st;
|
||||
}
|
||||
|
||||
protected:
|
||||
void _set_scanner_done() override {
|
||||
if (_dependency) {
|
||||
_dependency->set_scanner_done();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
void return_free_blocks() {
|
||||
if (_free_blocks_buffered.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
size_t total_bytes = 0;
|
||||
for (auto& block : _free_blocks_buffered) {
|
||||
const auto bytes = block->allocated_bytes();
|
||||
block->clear_column_data();
|
||||
_estimated_block_bytes = std::max(bytes, (size_t)16);
|
||||
total_bytes += bytes;
|
||||
}
|
||||
_free_blocks_memory_usage->add(total_bytes);
|
||||
const auto count = _free_blocks_buffered.size();
|
||||
_free_blocks.enqueue_bulk(std::make_move_iterator(_free_blocks_buffered.begin()), count);
|
||||
_free_blocks_buffered.clear();
|
||||
_serving_blocks_num -= count;
|
||||
}
|
||||
|
||||
std::vector<vectorized::BlockUPtr> _free_blocks_buffered;
|
||||
std::list<vectorized::BlockUPtr> _blocks_queue_buffered;
|
||||
std::shared_ptr<pipeline::ScanDependency> _dependency = nullptr;
|
||||
};
|
||||
|
||||
} // namespace doris::pipeline
|
||||
|
||||
@ -17,12 +17,10 @@
|
||||
|
||||
#include "scanner_context.h"
|
||||
|
||||
#include <bthread/bthread.h>
|
||||
#include <fmt/format.h>
|
||||
#include <gen_cpp/Metrics_types.h>
|
||||
#include <glog/logging.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <mutex>
|
||||
#include <ostream>
|
||||
#include <utility>
|
||||
@ -31,64 +29,57 @@
|
||||
#include "common/status.h"
|
||||
#include "pipeline/exec/scan_operator.h"
|
||||
#include "runtime/descriptors.h"
|
||||
#include "runtime/exec_env.h"
|
||||
#include "runtime/query_context.h"
|
||||
#include "runtime/runtime_state.h"
|
||||
#include "util/pretty_printer.h"
|
||||
#include "util/uid_util.h"
|
||||
#include "vec/core/block.h"
|
||||
#include "vec/exec/scan/scanner_scheduler.h"
|
||||
#include "vec/exec/scan/vscan_node.h"
|
||||
#include "vec/exec/scan/vscanner.h"
|
||||
|
||||
namespace doris::vectorized {
|
||||
|
||||
using namespace std::chrono_literals;
|
||||
|
||||
static bvar::Status<int64_t> g_bytes_in_scanner_queue("doris_bytes_in_scanner_queue", 0);
|
||||
static bvar::Status<int64_t> g_num_running_scanners("doris_num_running_scanners", 0);
|
||||
|
||||
ScannerContext::ScannerContext(RuntimeState* state, const TupleDescriptor* output_tuple_desc,
|
||||
const RowDescriptor* output_row_descriptor,
|
||||
const std::list<std::shared_ptr<ScannerDelegate>>& scanners,
|
||||
int64_t limit_, int64_t max_bytes_in_blocks_queue,
|
||||
const int num_parallel_instances,
|
||||
pipeline::ScanLocalStateBase* local_state,
|
||||
std::shared_ptr<pipeline::ScanDependency> dependency)
|
||||
pipeline::ScanLocalStateBase* local_state)
|
||||
: HasTaskExecutionCtx(state),
|
||||
_state(state),
|
||||
_parent(nullptr),
|
||||
_local_state(local_state),
|
||||
_output_tuple_desc(output_row_descriptor
|
||||
? output_row_descriptor->tuple_descriptors().front()
|
||||
: output_tuple_desc),
|
||||
_output_row_descriptor(output_row_descriptor),
|
||||
_process_status(Status::OK()),
|
||||
_batch_size(state->batch_size()),
|
||||
limit(limit_),
|
||||
_max_bytes_in_queue(std::max(max_bytes_in_blocks_queue, (int64_t)1024) *
|
||||
num_parallel_instances),
|
||||
_scanner_scheduler(state->exec_env()->scanner_scheduler()),
|
||||
_scanners(scanners.begin(), scanners.end()),
|
||||
_all_scanners(scanners.begin(), scanners.end()),
|
||||
_num_parallel_instances(num_parallel_instances),
|
||||
_dependency(dependency) {
|
||||
_num_parallel_instances(num_parallel_instances) {
|
||||
DCHECK(_output_row_descriptor == nullptr ||
|
||||
_output_row_descriptor->tuple_descriptors().size() == 1);
|
||||
_query_id = _state->get_query_ctx()->query_id();
|
||||
ctx_id = UniqueId::gen_uid().to_string();
|
||||
if (_scanners.empty()) {
|
||||
// Provide more memory for wide tables, increase proportionally by multiples of 300
|
||||
_max_bytes_in_queue *= _output_tuple_desc->slots().size() / 300 + 1;
|
||||
if (scanners.empty()) {
|
||||
_is_finished = true;
|
||||
_set_scanner_done();
|
||||
}
|
||||
_scanners.enqueue_bulk(scanners.begin(), scanners.size());
|
||||
if (limit < 0) {
|
||||
limit = -1;
|
||||
}
|
||||
_max_thread_num = config::doris_scanner_thread_pool_thread_num / 4;
|
||||
MAX_SCALE_UP_RATIO = _state->scanner_scale_up_ratio();
|
||||
_max_thread_num = _state->num_scanner_threads() > 0
|
||||
? _state->num_scanner_threads()
|
||||
: config::doris_scanner_thread_pool_thread_num /
|
||||
state->query_parallel_instance_num();
|
||||
_max_thread_num *= num_parallel_instances;
|
||||
_max_thread_num = _max_thread_num == 0 ? 1 : _max_thread_num;
|
||||
DCHECK(_max_thread_num > 0);
|
||||
_max_thread_num = std::min(_max_thread_num, (int32_t)_scanners.size());
|
||||
_max_thread_num = std::min(_max_thread_num, (int32_t)scanners.size());
|
||||
// 1. Calculate max concurrency
|
||||
// For select * from table limit 10; should just use one thread.
|
||||
if ((_parent && _parent->should_run_serial()) ||
|
||||
@ -104,45 +95,9 @@ ScannerContext::ScannerContext(doris::RuntimeState* state, doris::vectorized::VS
|
||||
int64_t limit_, int64_t max_bytes_in_blocks_queue,
|
||||
const int num_parallel_instances,
|
||||
pipeline::ScanLocalStateBase* local_state)
|
||||
: HasTaskExecutionCtx(state),
|
||||
_state(state),
|
||||
_parent(parent),
|
||||
_local_state(local_state),
|
||||
_output_tuple_desc(output_row_descriptor
|
||||
? output_row_descriptor->tuple_descriptors().front()
|
||||
: output_tuple_desc),
|
||||
_output_row_descriptor(output_row_descriptor),
|
||||
_process_status(Status::OK()),
|
||||
_batch_size(state->batch_size()),
|
||||
limit(limit_),
|
||||
_max_bytes_in_queue(std::max(max_bytes_in_blocks_queue, (int64_t)1024) *
|
||||
num_parallel_instances),
|
||||
_scanner_scheduler(state->exec_env()->scanner_scheduler()),
|
||||
_scanners(scanners.begin(), scanners.end()),
|
||||
_all_scanners(scanners.begin(), scanners.end()),
|
||||
_num_parallel_instances(num_parallel_instances) {
|
||||
DCHECK(_output_row_descriptor == nullptr ||
|
||||
_output_row_descriptor->tuple_descriptors().size() == 1);
|
||||
_query_id = _state->get_query_ctx()->query_id();
|
||||
ctx_id = UniqueId::gen_uid().to_string();
|
||||
if (_scanners.empty()) {
|
||||
_is_finished = true;
|
||||
_set_scanner_done();
|
||||
}
|
||||
if (limit < 0) {
|
||||
limit = -1;
|
||||
}
|
||||
_max_thread_num = config::doris_scanner_thread_pool_thread_num / 4;
|
||||
_max_thread_num *= num_parallel_instances;
|
||||
_max_thread_num = _max_thread_num == 0 ? 1 : _max_thread_num;
|
||||
DCHECK(_max_thread_num > 0);
|
||||
_max_thread_num = std::min(_max_thread_num, (int32_t)_scanners.size());
|
||||
// 1. Calculate max concurrency
|
||||
// For select * from table limit 10; should just use one thread.
|
||||
if ((_parent && _parent->should_run_serial()) ||
|
||||
(_local_state && _local_state->should_run_serial())) {
|
||||
_max_thread_num = 1;
|
||||
}
|
||||
: ScannerContext(state, output_tuple_desc, output_row_descriptor, scanners, limit_,
|
||||
max_bytes_in_blocks_queue, num_parallel_instances, local_state) {
|
||||
_parent = parent;
|
||||
}
|
||||
|
||||
// After init function call, should not access _parent
|
||||
@ -150,43 +105,21 @@ Status ScannerContext::init() {
|
||||
if (_parent) {
|
||||
_scanner_profile = _parent->_scanner_profile;
|
||||
_scanner_sched_counter = _parent->_scanner_sched_counter;
|
||||
_scanner_ctx_sched_counter = _parent->_scanner_ctx_sched_counter;
|
||||
_scanner_ctx_sched_time = _parent->_scanner_ctx_sched_time;
|
||||
_free_blocks_memory_usage = _parent->_free_blocks_memory_usage;
|
||||
_newly_create_free_blocks_num = _parent->_newly_create_free_blocks_num;
|
||||
_queued_blocks_memory_usage = _parent->_queued_blocks_memory_usage;
|
||||
_scanner_wait_batch_timer = _parent->_scanner_wait_batch_timer;
|
||||
_free_blocks_memory_usage_mark = _parent->_free_blocks_memory_usage;
|
||||
_scanner_ctx_sched_time = _parent->_scanner_ctx_sched_time;
|
||||
_scale_up_scanners_counter = _parent->_scale_up_scanners_counter;
|
||||
} else {
|
||||
_scanner_profile = _local_state->_scanner_profile;
|
||||
_scanner_sched_counter = _local_state->_scanner_sched_counter;
|
||||
_scanner_ctx_sched_counter = _local_state->_scanner_ctx_sched_counter;
|
||||
_scanner_ctx_sched_time = _local_state->_scanner_ctx_sched_time;
|
||||
_free_blocks_memory_usage = _local_state->_free_blocks_memory_usage;
|
||||
_newly_create_free_blocks_num = _local_state->_newly_create_free_blocks_num;
|
||||
_queued_blocks_memory_usage = _local_state->_queued_blocks_memory_usage;
|
||||
_scanner_wait_batch_timer = _local_state->_scanner_wait_batch_timer;
|
||||
_free_blocks_memory_usage_mark = _local_state->_free_blocks_memory_usage;
|
||||
_scanner_ctx_sched_time = _local_state->_scanner_ctx_sched_time;
|
||||
_scale_up_scanners_counter = _local_state->_scale_up_scanners_counter;
|
||||
}
|
||||
|
||||
// 2. Calculate the number of free blocks that all scanners can use.
|
||||
// The calculation logic is as follows:
|
||||
// 1. Assuming that at most M rows can be scanned in one scan(config::doris_scanner_row_num),
|
||||
// then figure out how many blocks are required for one scan(_block_per_scanner).
|
||||
// 2. The maximum number of concurrency * the blocks required for one scan,
|
||||
// that is, the number of blocks that all scanners can use.
|
||||
auto doris_scanner_row_num =
|
||||
limit == -1 ? config::doris_scanner_row_num
|
||||
: std::min(static_cast<int64_t>(config::doris_scanner_row_num), limit);
|
||||
int real_block_size =
|
||||
limit == -1 ? _batch_size : std::min(static_cast<int64_t>(_batch_size), limit);
|
||||
_block_per_scanner = (doris_scanner_row_num + (real_block_size - 1)) / real_block_size;
|
||||
_free_blocks_capacity = _max_thread_num * _block_per_scanner;
|
||||
auto block = get_free_block();
|
||||
_estimated_block_bytes = std::max(block->allocated_bytes(), (size_t)16);
|
||||
int min_blocks = (config::min_bytes_in_scanner_queue + _estimated_block_bytes - 1) /
|
||||
_estimated_block_bytes;
|
||||
_free_blocks_capacity = std::max(_free_blocks_capacity, min_blocks);
|
||||
return_free_block(std::move(block));
|
||||
|
||||
#ifndef BE_TEST
|
||||
// 3. get thread token
|
||||
if (_state->get_query_ctx()) {
|
||||
@ -198,8 +131,6 @@ Status ScannerContext::init() {
|
||||
}
|
||||
#endif
|
||||
|
||||
_num_unfinished_scanners = _scanners.size();
|
||||
|
||||
if (_parent) {
|
||||
COUNTER_SET(_parent->_max_scanner_thread_num, (int64_t)_max_thread_num);
|
||||
_parent->_runtime_profile->add_info_string("UseSpecificThreadToken",
|
||||
@ -210,6 +141,17 @@ Status ScannerContext::init() {
|
||||
thread_token == nullptr ? "False" : "True");
|
||||
}
|
||||
|
||||
// submit `_max_thread_num` running scanners to `ScannerScheduler`
|
||||
// When a running scanners is finished, it will submit one of the remaining scanners.
|
||||
for (int i = 0; i < _max_thread_num; ++i) {
|
||||
std::weak_ptr<ScannerDelegate> next_scanner;
|
||||
if (_scanners.try_dequeue(next_scanner)) {
|
||||
vectorized::BlockUPtr block = get_free_block();
|
||||
submit_scan_task(std::make_shared<ScanTask>(next_scanner, std::move(block)));
|
||||
_num_running_scanners++;
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
@ -220,140 +162,223 @@ std::string ScannerContext::parent_name() {
|
||||
vectorized::BlockUPtr ScannerContext::get_free_block() {
|
||||
vectorized::BlockUPtr block;
|
||||
if (_free_blocks.try_dequeue(block)) {
|
||||
std::lock_guard<std::mutex> fl(_free_blocks_lock);
|
||||
DCHECK(block->mem_reuse());
|
||||
_free_blocks_memory_usage->add(-block->allocated_bytes());
|
||||
_serving_blocks_num++;
|
||||
_free_blocks_memory_usage -= block->allocated_bytes();
|
||||
_free_blocks_memory_usage_mark->set(_free_blocks_memory_usage);
|
||||
return block;
|
||||
}
|
||||
|
||||
block = vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size,
|
||||
true /*ignore invalid slots*/);
|
||||
|
||||
COUNTER_UPDATE(_newly_create_free_blocks_num, 1);
|
||||
|
||||
_serving_blocks_num++;
|
||||
return block;
|
||||
_newly_create_free_blocks_num->update(1);
|
||||
return vectorized::Block::create_unique(_output_tuple_desc->slots(), _batch_size,
|
||||
true /*ignore invalid slots*/);
|
||||
}
|
||||
|
||||
void ScannerContext::return_free_block(std::unique_ptr<vectorized::Block> block) {
|
||||
_serving_blocks_num--;
|
||||
if (block->mem_reuse()) {
|
||||
// Only put blocks with schema to free blocks, because colocate blocks
|
||||
// need schema.
|
||||
_estimated_block_bytes = std::max(block->allocated_bytes(), (size_t)16);
|
||||
void ScannerContext::return_free_block(vectorized::BlockUPtr block) {
|
||||
std::lock_guard<std::mutex> fl(_free_blocks_lock);
|
||||
if (block->mem_reuse() && _free_blocks_memory_usage < _max_bytes_in_queue) {
|
||||
block->clear_column_data();
|
||||
_free_blocks_memory_usage->add(block->allocated_bytes());
|
||||
_free_blocks_memory_usage += block->allocated_bytes();
|
||||
_free_blocks_memory_usage_mark->set(_free_blocks_memory_usage);
|
||||
_free_blocks.enqueue(std::move(block));
|
||||
}
|
||||
}
|
||||
|
||||
void ScannerContext::append_blocks_to_queue(std::vector<vectorized::BlockUPtr>& blocks) {
|
||||
std::lock_guard l(_transfer_lock);
|
||||
auto old_bytes_in_queue = _cur_bytes_in_queue;
|
||||
for (auto& b : blocks) {
|
||||
auto st = validate_block_schema(b.get());
|
||||
if (!st.ok()) {
|
||||
set_status_on_error(st, false);
|
||||
}
|
||||
_cur_bytes_in_queue += b->allocated_bytes();
|
||||
_blocks_queue.push_back(std::move(b));
|
||||
}
|
||||
blocks.clear();
|
||||
if (_dependency) {
|
||||
_dependency->set_ready();
|
||||
}
|
||||
_blocks_queue_added_cv.notify_one();
|
||||
_queued_blocks_memory_usage->add(_cur_bytes_in_queue - old_bytes_in_queue);
|
||||
g_bytes_in_scanner_queue.set_value(_cur_bytes_in_queue);
|
||||
}
|
||||
|
||||
bool ScannerContext::empty_in_queue(int id) {
|
||||
std::unique_lock l(_transfer_lock);
|
||||
std::lock_guard<std::mutex> l(_transfer_lock);
|
||||
return _blocks_queue.empty();
|
||||
}
|
||||
|
||||
Status ScannerContext::get_block_from_queue(RuntimeState* state, vectorized::BlockUPtr* block,
|
||||
bool* eos, int id) {
|
||||
std::vector<vectorized::BlockUPtr> merge_blocks;
|
||||
{
|
||||
std::unique_lock l(_transfer_lock);
|
||||
// Normally, the scanner scheduler will schedule ctx.
|
||||
// But when the amount of data in the blocks queue exceeds the upper limit,
|
||||
// the scheduler will stop scheduling.
|
||||
// (if the scheduler continues to schedule, it will cause a lot of busy running).
|
||||
// At this point, consumers are required to trigger new scheduling to ensure that
|
||||
// data can be continuously fetched.
|
||||
bool to_be_schedule = should_be_scheduled();
|
||||
void ScannerContext::submit_scan_task(std::shared_ptr<ScanTask> scan_task) {
|
||||
_scanner_sched_counter->update(1);
|
||||
_num_scheduled_scanners++;
|
||||
_scanner_scheduler->submit(shared_from_this(), scan_task);
|
||||
}
|
||||
|
||||
bool is_scheduled = false;
|
||||
if (!done() && to_be_schedule && _num_running_scanners == 0) {
|
||||
is_scheduled = true;
|
||||
auto submit_status = _scanner_scheduler->submit(shared_from_this());
|
||||
if (!submit_status.ok()) {
|
||||
set_status_on_error(submit_status, false);
|
||||
void ScannerContext::append_block_to_queue(std::shared_ptr<ScanTask> scan_task) {
|
||||
if (scan_task->status_ok() && scan_task->current_block->rows() > 0) {
|
||||
Status st = validate_block_schema(scan_task->current_block.get());
|
||||
if (!st.ok()) {
|
||||
scan_task->set_status(st);
|
||||
}
|
||||
}
|
||||
std::lock_guard<std::mutex> l(_transfer_lock);
|
||||
if (!scan_task->status_ok()) {
|
||||
_process_status = scan_task->get_status();
|
||||
}
|
||||
if (_last_scale_up_time == 0) {
|
||||
_last_scale_up_time = UnixMillis();
|
||||
}
|
||||
if (_blocks_queue.empty() && _last_fetch_time != 0) {
|
||||
// there's no block in queue before current block, so the consumer is waiting
|
||||
_total_wait_block_time += UnixMillis() - _last_fetch_time;
|
||||
}
|
||||
_num_scheduled_scanners--;
|
||||
_blocks_queue.emplace_back(scan_task);
|
||||
_blocks_queue_added_cv.notify_one();
|
||||
}
|
||||
|
||||
Status ScannerContext::get_block_from_queue(RuntimeState* state, vectorized::Block* block,
|
||||
bool* eos, int id, bool wait) {
|
||||
if (state->is_cancelled()) {
|
||||
_set_scanner_done();
|
||||
return Status::Cancelled("Query cancelled in ScannerContext");
|
||||
}
|
||||
std::unique_lock l(_transfer_lock);
|
||||
// Wait for block from queue
|
||||
if (wait) {
|
||||
// scanner batch wait time
|
||||
SCOPED_TIMER(_scanner_wait_batch_timer);
|
||||
while (!done() && _blocks_queue.empty() && _process_status.ok()) {
|
||||
_blocks_queue_added_cv.wait_for(l, 1s);
|
||||
}
|
||||
}
|
||||
if (!_process_status.ok()) {
|
||||
_set_scanner_done();
|
||||
return _process_status;
|
||||
}
|
||||
std::shared_ptr<ScanTask> scan_task = nullptr;
|
||||
if (!_blocks_queue.empty() && !done()) {
|
||||
_last_fetch_time = UnixMillis();
|
||||
scan_task = _blocks_queue.front();
|
||||
_blocks_queue.pop_front();
|
||||
}
|
||||
|
||||
if (scan_task) {
|
||||
if (!scan_task->status_ok()) {
|
||||
_set_scanner_done();
|
||||
return scan_task->get_status();
|
||||
}
|
||||
// We can only know the block size after reading at least one block
|
||||
// Just take the size of first block as `_estimated_block_size`
|
||||
if (scan_task->first_block) {
|
||||
std::lock_guard<std::mutex> fl(_free_blocks_lock);
|
||||
size_t block_size = scan_task->current_block->allocated_bytes();
|
||||
_free_blocks_memory_usage += block_size;
|
||||
_free_blocks_memory_usage_mark->set(_free_blocks_memory_usage);
|
||||
scan_task->first_block = false;
|
||||
if (block_size > _estimated_block_size) {
|
||||
_estimated_block_size = block_size;
|
||||
}
|
||||
}
|
||||
|
||||
// Wait for block from queue
|
||||
{
|
||||
SCOPED_TIMER(_scanner_wait_batch_timer);
|
||||
// scanner batch wait time
|
||||
while (!(!_blocks_queue.empty() || done() || !status().ok() || state->is_cancelled())) {
|
||||
if (!is_scheduled && _num_running_scanners == 0 && should_be_scheduled()) {
|
||||
LOG(INFO) << debug_string();
|
||||
}
|
||||
_blocks_queue_added_cv.wait_for(l, 1s);
|
||||
}
|
||||
// consume current block
|
||||
block->swap(*scan_task->current_block);
|
||||
if (!scan_task->current_block->mem_reuse()) {
|
||||
// it depends on the memory strategy of ScanNode/ScanOperator
|
||||
// we should double check `mem_reuse()` of `current_block` to make sure it can be reused
|
||||
_newly_create_free_blocks_num->update(1);
|
||||
scan_task->current_block = vectorized::Block::create_unique(_output_tuple_desc->slots(),
|
||||
_batch_size, true);
|
||||
}
|
||||
|
||||
if (state->is_cancelled()) {
|
||||
set_status_on_error(Status::Cancelled("cancelled"), false);
|
||||
}
|
||||
|
||||
if (!status().ok()) {
|
||||
return status();
|
||||
}
|
||||
|
||||
if (!_blocks_queue.empty()) {
|
||||
*block = std::move(_blocks_queue.front());
|
||||
_blocks_queue.pop_front();
|
||||
auto block_bytes = (*block)->allocated_bytes();
|
||||
_cur_bytes_in_queue -= block_bytes;
|
||||
_queued_blocks_memory_usage->add(-block_bytes);
|
||||
|
||||
auto rows = (*block)->rows();
|
||||
while (!_blocks_queue.empty()) {
|
||||
auto& add_block = _blocks_queue.front();
|
||||
const auto add_rows = (*add_block).rows();
|
||||
if (rows + add_rows < state->batch_size()) {
|
||||
rows += add_rows;
|
||||
block_bytes = (*add_block).allocated_bytes();
|
||||
_cur_bytes_in_queue -= block_bytes;
|
||||
_queued_blocks_memory_usage->add(-block_bytes);
|
||||
merge_blocks.emplace_back(std::move(add_block));
|
||||
_blocks_queue.pop_front();
|
||||
} else {
|
||||
break;
|
||||
if (scan_task->is_eos()) { // current scanner is finished, and no more data to read
|
||||
_num_finished_scanners++;
|
||||
std::weak_ptr<ScannerDelegate> next_scanner;
|
||||
// submit one of the remaining scanners
|
||||
if (_scanners.try_dequeue(next_scanner)) {
|
||||
// reuse current running scanner, just reset some states.
|
||||
scan_task->reuse_scanner(next_scanner);
|
||||
submit_scan_task(scan_task);
|
||||
} else {
|
||||
// no more scanner to be scheduled
|
||||
// `_free_blocks` serve all running scanners, maybe it's too large for the remaining scanners
|
||||
int free_blocks_for_each = _free_blocks.size_approx() / _num_running_scanners;
|
||||
_num_running_scanners--;
|
||||
std::lock_guard<std::mutex> fl(_free_blocks_lock);
|
||||
for (int i = 0; i < free_blocks_for_each; ++i) {
|
||||
vectorized::BlockUPtr removed_block;
|
||||
if (_free_blocks.try_dequeue(removed_block)) {
|
||||
_free_blocks_memory_usage -= block->allocated_bytes();
|
||||
_free_blocks_memory_usage_mark->set(_free_blocks_memory_usage);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
*eos = done();
|
||||
// resubmit current running scanner to read the next block
|
||||
submit_scan_task(scan_task);
|
||||
}
|
||||
// scale up
|
||||
_try_to_scale_up();
|
||||
}
|
||||
|
||||
g_bytes_in_scanner_queue.set_value(_cur_bytes_in_queue);
|
||||
if (!merge_blocks.empty()) {
|
||||
vectorized::MutableBlock m(block->get());
|
||||
for (auto& merge_block : merge_blocks) {
|
||||
static_cast<void>(m.merge(*merge_block));
|
||||
return_free_block(std::move(merge_block));
|
||||
}
|
||||
(*block)->set_columns(std::move(m.mutable_columns()));
|
||||
if (_num_finished_scanners == _all_scanners.size() && _blocks_queue.empty()) {
|
||||
_set_scanner_done();
|
||||
_is_finished = true;
|
||||
}
|
||||
|
||||
*eos = done();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void ScannerContext::_try_to_scale_up() {
|
||||
// Four criteria to determine whether to increase the parallelism of the scanners
|
||||
// 1. It ran for at least `SCALE_UP_DURATION` ms after last scale up
|
||||
// 2. Half(`WAIT_BLOCK_DURATION_RATIO`) of the duration is waiting to get blocks
|
||||
// 3. `_free_blocks_memory_usage` < `_max_bytes_in_queue`, remains enough memory to scale up
|
||||
// 4. At most scale up `MAX_SCALE_UP_RATIO` times to `_max_thread_num`
|
||||
if (MAX_SCALE_UP_RATIO > 0 && _scanners.size_approx() > 0 &&
|
||||
(_num_running_scanners < _max_thread_num * MAX_SCALE_UP_RATIO) &&
|
||||
(_last_fetch_time - _last_scale_up_time > SCALE_UP_DURATION) && // duration > 5000ms
|
||||
(_total_wait_block_time > (_last_fetch_time - _last_scale_up_time) *
|
||||
WAIT_BLOCK_DURATION_RATIO)) { // too large lock time
|
||||
double wait_ratio =
|
||||
(double)_total_wait_block_time / (_last_fetch_time - _last_scale_up_time);
|
||||
if (_last_wait_duration_ratio > 0 && wait_ratio > _last_wait_duration_ratio * 0.8) {
|
||||
// when _last_wait_duration_ratio > 0, it has scaled up before.
|
||||
// we need to determine if the scale-up is effective:
|
||||
// the wait duration ratio after last scaling up should less than 80% of `_last_wait_duration_ratio`
|
||||
return;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> fl(_free_blocks_lock);
|
||||
bool is_scale_up = false;
|
||||
// calculate the number of scanners that can be scheduled
|
||||
int num_add = std::min(_num_running_scanners * SCALE_UP_RATIO,
|
||||
_max_thread_num * MAX_SCALE_UP_RATIO - _num_running_scanners);
|
||||
num_add = std::max(num_add, 1);
|
||||
for (int i = 0; i < num_add; ++i) {
|
||||
vectorized::BlockUPtr allocate_block = nullptr;
|
||||
// reuse block in `_free_blocks` firstly
|
||||
if (!_free_blocks.try_dequeue(allocate_block)) {
|
||||
if (_free_blocks_memory_usage < _max_bytes_in_queue) {
|
||||
_newly_create_free_blocks_num->update(1);
|
||||
allocate_block = vectorized::Block::create_unique(_output_tuple_desc->slots(),
|
||||
_batch_size, true);
|
||||
}
|
||||
} else {
|
||||
// comes from `_free_blocks`, decrease first, then will be added back.
|
||||
_free_blocks_memory_usage -= allocate_block->allocated_bytes();
|
||||
_free_blocks_memory_usage_mark->set(_free_blocks_memory_usage);
|
||||
}
|
||||
if (allocate_block) {
|
||||
// get enough memory to launch one more scanner.
|
||||
std::weak_ptr<ScannerDelegate> scale_up_scanner;
|
||||
if (_scanners.try_dequeue(scale_up_scanner)) {
|
||||
std::shared_ptr<ScanTask> scale_up_task =
|
||||
std::make_shared<ScanTask>(scale_up_scanner, std::move(allocate_block));
|
||||
_free_blocks_memory_usage += _estimated_block_size;
|
||||
_free_blocks_memory_usage_mark->set(_free_blocks_memory_usage);
|
||||
// `first_block` is used to update `_free_blocks_memory_usage`,
|
||||
// we have got the `_estimated_block_size`, no need for further updates
|
||||
scale_up_task->first_block = false;
|
||||
submit_scan_task(scale_up_task);
|
||||
_num_running_scanners++;
|
||||
_scale_up_scanners_counter->update(1);
|
||||
is_scale_up = true;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (is_scale_up) {
|
||||
_last_wait_duration_ratio = wait_ratio;
|
||||
_last_scale_up_time = UnixMillis();
|
||||
_total_wait_block_time = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Status ScannerContext::validate_block_schema(Block* block) {
|
||||
size_t index = 0;
|
||||
for (auto& slot : _output_tuple_desc->slots()) {
|
||||
@ -380,29 +405,17 @@ Status ScannerContext::validate_block_schema(Block* block) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
void ScannerContext::inc_num_running_scanners(int32_t inc) {
|
||||
std::lock_guard l(_transfer_lock);
|
||||
_num_running_scanners += inc;
|
||||
g_num_running_scanners.set_value(_num_running_scanners);
|
||||
}
|
||||
|
||||
void ScannerContext::set_status_on_error(const Status& status, bool need_lock) {
|
||||
std::unique_lock l(_transfer_lock, std::defer_lock);
|
||||
if (need_lock) {
|
||||
l.lock();
|
||||
}
|
||||
if (this->status().ok()) {
|
||||
_process_status = status;
|
||||
_blocks_queue_added_cv.notify_one();
|
||||
_should_stop = true;
|
||||
_set_scanner_done();
|
||||
LOG(INFO) << "ctx is set status on error " << debug_string()
|
||||
<< ", call stack is: " << Status::InternalError<true>("catch error status");
|
||||
}
|
||||
void ScannerContext::set_status_on_error(const Status& status) {
|
||||
std::lock_guard<std::mutex> l(_transfer_lock);
|
||||
_process_status = status;
|
||||
_blocks_queue_added_cv.notify_one();
|
||||
}
|
||||
|
||||
void ScannerContext::stop_scanners(RuntimeState* state) {
|
||||
std::unique_lock l(_transfer_lock);
|
||||
std::lock_guard<std::mutex> l(_transfer_lock);
|
||||
if (_should_stop) {
|
||||
return;
|
||||
}
|
||||
_should_stop = true;
|
||||
_set_scanner_done();
|
||||
for (const std::weak_ptr<ScannerDelegate>& scanner : _all_scanners) {
|
||||
@ -453,95 +466,15 @@ void ScannerContext::stop_scanners(RuntimeState* state) {
|
||||
_blocks_queue_added_cv.notify_one();
|
||||
}
|
||||
|
||||
void ScannerContext::_set_scanner_done() {
|
||||
if (_dependency) {
|
||||
_dependency->set_scanner_done();
|
||||
}
|
||||
}
|
||||
|
||||
std::string ScannerContext::debug_string() {
|
||||
return fmt::format(
|
||||
"id: {}, total scanners: {}, scanners: {}, blocks in queue: {},"
|
||||
" status: {}, _should_stop: {}, _is_finished: {}, free blocks: {},"
|
||||
"id: {}, total scanners: {}, blocks in queue: {},"
|
||||
" _should_stop: {}, _is_finished: {}, free blocks: {},"
|
||||
" limit: {}, _num_running_scanners: {}, _max_thread_num: {},"
|
||||
" _block_per_scanner: {}, _cur_bytes_in_queue: {}, MAX_BYTE_OF_QUEUE: {}, "
|
||||
"num_ctx_scheduled: {}, serving_blocks_num: {}, allowed_blocks_num: {}, query_id: {}",
|
||||
ctx_id, _all_scanners.size(), _scanners.size(), _blocks_queue.size(),
|
||||
_process_status.to_string(), _should_stop, _is_finished, _free_blocks.size_approx(),
|
||||
limit, _num_running_scanners, _max_thread_num, _block_per_scanner, _cur_bytes_in_queue,
|
||||
_max_bytes_in_queue, num_ctx_scheduled(), _serving_blocks_num, allowed_blocks_num(),
|
||||
print_id(_query_id));
|
||||
}
|
||||
|
||||
void ScannerContext::reschedule_scanner_ctx() {
|
||||
std::lock_guard l(_transfer_lock);
|
||||
if (done()) {
|
||||
return;
|
||||
}
|
||||
auto submit_status = _scanner_scheduler->submit(shared_from_this());
|
||||
//todo(wb) rethinking is it better to mark current scan_context failed when submit failed many times?
|
||||
if (!submit_status.ok()) {
|
||||
set_status_on_error(submit_status, false);
|
||||
}
|
||||
}
|
||||
|
||||
void ScannerContext::push_back_scanner_and_reschedule(std::shared_ptr<ScannerDelegate> scanner) {
|
||||
std::lock_guard l(_transfer_lock);
|
||||
// Use a transfer lock to avoid the scanner be scheduled concurrently. For example, that after
|
||||
// calling "_scanners.push_front(scanner)", there may be other ctx in scheduler
|
||||
// to schedule that scanner right away, and in that schedule run, the scanner may be marked as closed
|
||||
// before we call the following if() block.
|
||||
{
|
||||
--_num_running_scanners;
|
||||
g_num_running_scanners.set_value(_num_running_scanners);
|
||||
if (scanner->_scanner->need_to_close()) {
|
||||
--_num_unfinished_scanners;
|
||||
if (_num_unfinished_scanners == 0) {
|
||||
_is_finished = true;
|
||||
_set_scanner_done();
|
||||
_blocks_queue_added_cv.notify_one();
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
_scanners.push_front(scanner);
|
||||
}
|
||||
}
|
||||
|
||||
if (should_be_scheduled()) {
|
||||
auto submit_status = _scanner_scheduler->submit(shared_from_this());
|
||||
if (!submit_status.ok()) {
|
||||
set_status_on_error(submit_status, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This method is called in scanner scheduler, and task context is hold
|
||||
void ScannerContext::get_next_batch_of_scanners(
|
||||
std::list<std::weak_ptr<ScannerDelegate>>* current_run) {
|
||||
std::lock_guard l(_transfer_lock);
|
||||
// Update the sched counter for profile
|
||||
Defer defer {[&]() { _scanner_sched_counter->update(current_run->size()); }};
|
||||
// 1. Calculate how many scanners should be scheduled at this run.
|
||||
// If there are enough space in blocks queue,
|
||||
// the scanner number depends on the _free_blocks numbers
|
||||
int thread_slot_num = get_available_thread_slot_num();
|
||||
|
||||
// 2. get #thread_slot_num scanners from ctx->scanners
|
||||
// and put them into "this_run".
|
||||
for (int i = 0; i < thread_slot_num && !_scanners.empty();) {
|
||||
std::weak_ptr<ScannerDelegate> scanner_ref = _scanners.front();
|
||||
std::shared_ptr<ScannerDelegate> scanner = scanner_ref.lock();
|
||||
_scanners.pop_front();
|
||||
if (scanner == nullptr) {
|
||||
continue;
|
||||
}
|
||||
if (scanner->_scanner->need_to_close()) {
|
||||
static_cast<void>(scanner->_scanner->close(_state));
|
||||
} else {
|
||||
current_run->push_back(scanner_ref);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
" _max_bytes_in_queue: {}, query_id: {}",
|
||||
ctx_id, _all_scanners.size(), _blocks_queue.size(), _should_stop, _is_finished,
|
||||
_free_blocks.size_approx(), limit, _num_scheduled_scanners, _max_thread_num,
|
||||
_max_bytes_in_queue, print_id(_query_id));
|
||||
}
|
||||
|
||||
} // namespace doris::vectorized
|
||||
|
||||
@ -58,6 +58,47 @@ class VScanNode;
|
||||
class ScannerScheduler;
|
||||
class SimplifiedScanScheduler;
|
||||
|
||||
class ScanTask {
|
||||
public:
|
||||
ScanTask(std::weak_ptr<ScannerDelegate> delegate_scanner, vectorized::BlockUPtr free_block)
|
||||
: scanner(delegate_scanner), current_block(std::move(free_block)) {}
|
||||
|
||||
private:
|
||||
// whether current scanner is finished
|
||||
bool eos = false;
|
||||
Status status = Status::OK();
|
||||
|
||||
public:
|
||||
std::weak_ptr<ScannerDelegate> scanner;
|
||||
// cache the block of current loop
|
||||
vectorized::BlockUPtr current_block;
|
||||
// only take the size of the first block as estimated size
|
||||
bool first_block = true;
|
||||
uint64_t last_submit_time; // nanoseconds
|
||||
|
||||
void set_status(Status _status) {
|
||||
if (_status.is<ErrorCode::END_OF_FILE>()) {
|
||||
// set `eos` if `END_OF_FILE`, don't take `END_OF_FILE` as error
|
||||
eos = true;
|
||||
}
|
||||
status = _status;
|
||||
}
|
||||
Status get_status() const { return status; }
|
||||
bool status_ok() { return status.ok() || status.is<ErrorCode::END_OF_FILE>(); }
|
||||
bool is_eos() const { return eos; }
|
||||
void set_eos(bool _eos) { eos = _eos; }
|
||||
|
||||
// reuse current running scanner
|
||||
// reset `eos` and `status`
|
||||
// `first_block` is used to update `_free_blocks_memory_usage`, and take the first block size
|
||||
// as the `_estimated_block_size`. It has updated `_free_blocks_memory_usage`, so don't reset.
|
||||
void reuse_scanner(std::weak_ptr<ScannerDelegate> next_scanner) {
|
||||
scanner = next_scanner;
|
||||
eos = false;
|
||||
status = Status::OK();
|
||||
}
|
||||
};
|
||||
|
||||
// ScannerContext is responsible for recording the execution status
|
||||
// of a group of Scanners corresponding to a ScanNode.
|
||||
// Including how many scanners are being scheduled, and maintaining
|
||||
@ -81,88 +122,49 @@ public:
|
||||
virtual Status init();
|
||||
|
||||
vectorized::BlockUPtr get_free_block();
|
||||
void return_free_block(std::unique_ptr<vectorized::Block> block);
|
||||
void return_free_block(vectorized::BlockUPtr block);
|
||||
|
||||
// Append blocks from scanners to the blocks queue.
|
||||
virtual void append_blocks_to_queue(std::vector<vectorized::BlockUPtr>& blocks);
|
||||
// Get next block from blocks queue. Called by ScanNode
|
||||
// Get next block from blocks queue. Called by ScanNode/ScanOperator
|
||||
// Set eos to true if there is no more data to read.
|
||||
// And if eos is true, the block returned must be nullptr.
|
||||
virtual Status get_block_from_queue(RuntimeState* state, vectorized::BlockUPtr* block,
|
||||
bool* eos, int id);
|
||||
virtual Status get_block_from_queue(RuntimeState* state, vectorized::Block* block, bool* eos,
|
||||
int id, bool wait = true);
|
||||
|
||||
[[nodiscard]] Status validate_block_schema(Block* block);
|
||||
|
||||
// When a scanner complete a scan, this method will be called
|
||||
// to return the scanner to the list for next scheduling.
|
||||
void push_back_scanner_and_reschedule(std::shared_ptr<ScannerDelegate> scanner);
|
||||
// submit the running scanner to thread pool in `ScannerScheduler`
|
||||
// set the next scanned block to `ScanTask::current_block`
|
||||
// set the error state to `ScanTask::status`
|
||||
// set the `eos` to `ScanTask::eos` if there is no more data in current scanner
|
||||
void submit_scan_task(std::shared_ptr<ScanTask> scan_task);
|
||||
|
||||
void set_status_on_error(const Status& status, bool need_lock = true);
|
||||
// append the running scanner and its cached block to `_blocks_queue`
|
||||
virtual void append_block_to_queue(std::shared_ptr<ScanTask> scan_task);
|
||||
|
||||
Status status() {
|
||||
if (_process_status.is<ErrorCode::END_OF_FILE>()) {
|
||||
return Status::OK();
|
||||
}
|
||||
return _process_status;
|
||||
}
|
||||
void set_status_on_error(const Status& status);
|
||||
|
||||
// Return true if this ScannerContext need no more process
|
||||
bool done() const { return _is_finished || _should_stop; }
|
||||
bool is_finished() { return _is_finished.load(); }
|
||||
bool should_stop() { return _should_stop.load(); }
|
||||
|
||||
void inc_num_running_scanners(int32_t scanner_inc);
|
||||
|
||||
int get_num_running_scanners() const { return _num_running_scanners; }
|
||||
|
||||
int get_num_unfinished_scanners() const { return _num_unfinished_scanners; }
|
||||
|
||||
void get_next_batch_of_scanners(std::list<std::weak_ptr<ScannerDelegate>>* current_run);
|
||||
|
||||
virtual std::string debug_string();
|
||||
|
||||
RuntimeState* state() { return _state; }
|
||||
|
||||
void incr_num_ctx_scheduling(int64_t num) { _scanner_ctx_sched_counter->update(num); }
|
||||
|
||||
int64_t num_ctx_scheduled() { return _scanner_ctx_sched_counter->value(); }
|
||||
void incr_ctx_scheduling_time(int64_t num) { _scanner_ctx_sched_time->update(num); }
|
||||
|
||||
std::string parent_name();
|
||||
|
||||
virtual bool empty_in_queue(int id);
|
||||
|
||||
// todo(wb) rethinking how to calculate ```_max_bytes_in_queue``` when executing shared scan
|
||||
inline bool should_be_scheduled() const {
|
||||
return (_cur_bytes_in_queue < _max_bytes_in_queue / 2) &&
|
||||
(_serving_blocks_num < allowed_blocks_num());
|
||||
}
|
||||
|
||||
int get_available_thread_slot_num() {
|
||||
int thread_slot_num = 0;
|
||||
thread_slot_num = (allowed_blocks_num() + _block_per_scanner - 1) / _block_per_scanner;
|
||||
thread_slot_num = std::min(thread_slot_num, _max_thread_num - _num_running_scanners);
|
||||
if (thread_slot_num <= 0) {
|
||||
thread_slot_num = 1;
|
||||
}
|
||||
return thread_slot_num;
|
||||
}
|
||||
|
||||
int32_t allowed_blocks_num() const {
|
||||
int32_t blocks_num = std::min(_free_blocks_capacity,
|
||||
int32_t((_max_bytes_in_queue + _estimated_block_bytes - 1) /
|
||||
_estimated_block_bytes));
|
||||
return blocks_num;
|
||||
}
|
||||
|
||||
SimplifiedScanScheduler* get_simple_scan_scheduler() { return _simple_scan_scheduler; }
|
||||
|
||||
virtual void reschedule_scanner_ctx();
|
||||
void stop_scanners(RuntimeState* state);
|
||||
|
||||
int32_t get_max_thread_num() const { return _max_thread_num; }
|
||||
void set_max_thread_num(int32_t num) { _max_thread_num = num; }
|
||||
|
||||
int batch_size() const { return _batch_size; }
|
||||
|
||||
// the unique id of this context
|
||||
std::string ctx_id;
|
||||
TUniqueId _query_id;
|
||||
@ -176,10 +178,16 @@ protected:
|
||||
const RowDescriptor* output_row_descriptor,
|
||||
const std::list<std::shared_ptr<ScannerDelegate>>& scanners_, int64_t limit_,
|
||||
int64_t max_bytes_in_blocks_queue_, const int num_parallel_instances,
|
||||
pipeline::ScanLocalStateBase* local_state,
|
||||
std::shared_ptr<pipeline::ScanDependency> dependency);
|
||||
pipeline::ScanLocalStateBase* local_state);
|
||||
|
||||
void _set_scanner_done();
|
||||
/// Four criteria to determine whether to increase the parallelism of the scanners
|
||||
/// 1. It ran for at least `SCALE_UP_DURATION` ms after last scale up
|
||||
/// 2. Half(`WAIT_BLOCK_DURATION_RATIO`) of the duration is waiting to get blocks
|
||||
/// 3. `_free_blocks_memory_usage` < `_max_bytes_in_queue`, remains enough memory to scale up
|
||||
/// 4. At most scale up `MAX_SCALE_UP_RATIO` times to `_max_thread_num`
|
||||
virtual void _set_scanner_done() {};
|
||||
|
||||
void _try_to_scale_up();
|
||||
|
||||
RuntimeState* _state = nullptr;
|
||||
VScanNode* _parent = nullptr;
|
||||
@ -189,97 +197,52 @@ protected:
|
||||
const TupleDescriptor* _output_tuple_desc = nullptr;
|
||||
const RowDescriptor* _output_row_descriptor = nullptr;
|
||||
|
||||
// _transfer_lock is used to protect the critical section
|
||||
// where the ScanNode and ScannerScheduler interact.
|
||||
// Including access to variables such as blocks_queue, _process_status, _is_finished, etc.
|
||||
std::mutex _transfer_lock;
|
||||
// The blocks got from scanners will be added to the "blocks_queue".
|
||||
// And the upper scan node will be as a consumer to fetch blocks from this queue.
|
||||
// Should be protected by "_transfer_lock"
|
||||
std::list<vectorized::BlockUPtr> _blocks_queue;
|
||||
// Wait in get_block_from_queue(), by ScanNode.
|
||||
std::condition_variable _blocks_queue_added_cv;
|
||||
// Wait in clear_and_join(), by ScanNode.
|
||||
std::condition_variable _ctx_finish_cv;
|
||||
std::list<std::shared_ptr<ScanTask>> _blocks_queue;
|
||||
|
||||
// The following 3 variables control the process of the scanner scheduling.
|
||||
// Use _transfer_lock to protect them.
|
||||
// 1. _process_status
|
||||
// indicates the global status of this scanner context.
|
||||
// Set to non-ok if encounter errors.
|
||||
// And if it is non-ok, the scanner process should stop.
|
||||
// Set be set by either ScanNode or ScannerScheduler.
|
||||
// 2. _should_stop
|
||||
// Always be set by ScanNode.
|
||||
// True means no more data need to be read(reach limit or closed)
|
||||
// 3. _is_finished
|
||||
// Always be set by ScannerScheduler.
|
||||
// True means all scanners are finished to scan.
|
||||
Status _process_status;
|
||||
Status _process_status = Status::OK();
|
||||
std::atomic_bool _should_stop = false;
|
||||
std::atomic_bool _is_finished = false;
|
||||
|
||||
// Lazy-allocated blocks for all scanners to share, for memory reuse.
|
||||
moodycamel::ConcurrentQueue<vectorized::BlockUPtr> _free_blocks;
|
||||
std::atomic<int32_t> _serving_blocks_num = 0;
|
||||
// The current number of free blocks available to the scanners.
|
||||
// Used to limit the memory usage of the scanner.
|
||||
// NOTE: this is NOT the size of `_free_blocks`.
|
||||
int32_t _free_blocks_capacity = 0;
|
||||
int64_t _estimated_block_bytes = 0;
|
||||
|
||||
int _batch_size;
|
||||
// The limit from SQL's limit clause
|
||||
int64_t limit;
|
||||
|
||||
// Current number of running scanners.
|
||||
std::atomic_int32_t _num_running_scanners = 0;
|
||||
// Current number of ctx being scheduled.
|
||||
// After each Scanner finishes a task, it will put the corresponding ctx
|
||||
// back into the scheduling queue.
|
||||
// Therefore, there will be multiple pointer of same ctx in the scheduling queue.
|
||||
// Here we record the number of ctx in the scheduling queue to clean up at the end.
|
||||
std::atomic_int32_t _num_scheduling_ctx = 0;
|
||||
// Num of unfinished scanners. Should be set in init()
|
||||
std::atomic_int32_t _num_unfinished_scanners = 0;
|
||||
// Max number of scan thread for this scanner context.
|
||||
int32_t _max_thread_num = 0;
|
||||
// How many blocks a scanner can use in one task.
|
||||
int32_t _block_per_scanner = 0;
|
||||
|
||||
// The current bytes of blocks in blocks queue
|
||||
int64_t _cur_bytes_in_queue = 0;
|
||||
// The max limit bytes of blocks in blocks queue
|
||||
const int64_t _max_bytes_in_queue;
|
||||
|
||||
int64_t _max_bytes_in_queue;
|
||||
doris::vectorized::ScannerScheduler* _scanner_scheduler;
|
||||
SimplifiedScanScheduler* _simple_scan_scheduler = nullptr;
|
||||
// List "scanners" saves all "unfinished" scanners.
|
||||
// The scanner scheduler will pop scanners from this list, run scanner,
|
||||
// and then if the scanner is not finished, will be pushed back to this list.
|
||||
// Not need to protect by lock, because only one scheduler thread will access to it.
|
||||
std::mutex _scanners_lock;
|
||||
// Scanner's ownership belong to vscannode or scanoperator, scanner context does not own it.
|
||||
// ScannerContext has to check if scanner is deconstructed before use it.
|
||||
std::list<std::weak_ptr<ScannerDelegate>> _scanners;
|
||||
moodycamel::ConcurrentQueue<std::weak_ptr<ScannerDelegate>> _scanners;
|
||||
int32_t _num_scheduled_scanners = 0;
|
||||
int32_t _num_finished_scanners = 0;
|
||||
int32_t _num_running_scanners = 0;
|
||||
// weak pointer for _scanners, used in stop function
|
||||
std::vector<std::weak_ptr<ScannerDelegate>> _all_scanners;
|
||||
std::vector<int64_t> _finished_scanner_runtime;
|
||||
std::vector<int64_t> _finished_scanner_rows_read;
|
||||
std::vector<int64_t> _finished_scanner_wait_worker_time;
|
||||
|
||||
const int _num_parallel_instances;
|
||||
|
||||
std::shared_ptr<RuntimeProfile> _scanner_profile;
|
||||
RuntimeProfile::Counter* _scanner_sched_counter = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_ctx_sched_counter = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr;
|
||||
RuntimeProfile::HighWaterMarkCounter* _free_blocks_memory_usage = nullptr;
|
||||
RuntimeProfile::HighWaterMarkCounter* _queued_blocks_memory_usage = nullptr;
|
||||
RuntimeProfile::Counter* _newly_create_free_blocks_num = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_wait_batch_timer = nullptr;
|
||||
RuntimeProfile::HighWaterMarkCounter* _free_blocks_memory_usage_mark = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr;
|
||||
RuntimeProfile::Counter* _scale_up_scanners_counter = nullptr;
|
||||
|
||||
std::shared_ptr<pipeline::ScanDependency> _dependency = nullptr;
|
||||
// for scaling up the running scanners
|
||||
std::mutex _free_blocks_lock;
|
||||
size_t _estimated_block_size = 0;
|
||||
int64_t _free_blocks_memory_usage = 0;
|
||||
int64_t _last_scale_up_time = 0;
|
||||
int64_t _last_fetch_time = 0;
|
||||
int64_t _total_wait_block_time = 0;
|
||||
double _last_wait_duration_ratio = 0;
|
||||
const int64_t SCALE_UP_DURATION = 5000; // 5000ms
|
||||
const float WAIT_BLOCK_DURATION_RATIO = 0.5;
|
||||
const float SCALE_UP_RATIO = 0.5;
|
||||
float MAX_SCALE_UP_RATIO;
|
||||
};
|
||||
} // namespace vectorized
|
||||
} // namespace doris
|
||||
|
||||
@ -69,10 +69,6 @@ ScannerScheduler::~ScannerScheduler() {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < QUEUE_NUM; i++) {
|
||||
delete _pending_queues[i];
|
||||
}
|
||||
delete[] _pending_queues;
|
||||
_deregister_metrics();
|
||||
}
|
||||
|
||||
@ -81,18 +77,12 @@ void ScannerScheduler::stop() {
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < QUEUE_NUM; i++) {
|
||||
_pending_queues[i]->shutdown();
|
||||
}
|
||||
|
||||
_is_closed = true;
|
||||
|
||||
_scheduler_pool->shutdown();
|
||||
_local_scan_thread_pool->shutdown();
|
||||
_remote_scan_thread_pool->shutdown();
|
||||
_limited_scan_thread_pool->shutdown();
|
||||
|
||||
_scheduler_pool->wait();
|
||||
_local_scan_thread_pool->join();
|
||||
_remote_scan_thread_pool->join();
|
||||
_limited_scan_thread_pool->wait();
|
||||
@ -101,24 +91,12 @@ void ScannerScheduler::stop() {
|
||||
}
|
||||
|
||||
Status ScannerScheduler::init(ExecEnv* env) {
|
||||
// 1. scheduling thread pool and scheduling queues
|
||||
static_cast<void>(ThreadPoolBuilder("SchedulingThreadPool")
|
||||
.set_min_threads(QUEUE_NUM)
|
||||
.set_max_threads(QUEUE_NUM)
|
||||
.build(&_scheduler_pool));
|
||||
|
||||
_pending_queues = new BlockingQueue<std::shared_ptr<ScannerContext>>*[QUEUE_NUM];
|
||||
for (int i = 0; i < QUEUE_NUM; i++) {
|
||||
_pending_queues[i] = new BlockingQueue<std::shared_ptr<ScannerContext>>(INT32_MAX);
|
||||
static_cast<void>(_scheduler_pool->submit_func([this, i] { this->_schedule_thread(i); }));
|
||||
}
|
||||
|
||||
// 2. local scan thread pool
|
||||
// 1. local scan thread pool
|
||||
_local_scan_thread_pool = std::make_unique<PriorityThreadPool>(
|
||||
config::doris_scanner_thread_pool_thread_num,
|
||||
config::doris_scanner_thread_pool_queue_size, "local_scan");
|
||||
|
||||
// 3. remote scan thread pool
|
||||
// 2. remote scan thread pool
|
||||
_remote_thread_pool_max_size = config::doris_max_remote_scanner_thread_pool_thread_num != -1
|
||||
? config::doris_max_remote_scanner_thread_pool_thread_num
|
||||
: std::max(512, CpuInfo::num_cores() * 10);
|
||||
@ -128,7 +106,7 @@ Status ScannerScheduler::init(ExecEnv* env) {
|
||||
_remote_thread_pool_max_size, config::doris_remote_scanner_thread_pool_queue_size,
|
||||
"RemoteScanThreadPool");
|
||||
|
||||
// 4. limited scan thread pool
|
||||
// 3. limited scan thread pool
|
||||
static_cast<void>(ThreadPoolBuilder("LimitedScanThreadPool")
|
||||
.set_min_threads(config::doris_scanner_thread_pool_thread_num)
|
||||
.set_max_threads(config::doris_scanner_thread_pool_thread_num)
|
||||
@ -139,15 +117,75 @@ Status ScannerScheduler::init(ExecEnv* env) {
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status ScannerScheduler::submit(std::shared_ptr<ScannerContext> ctx) {
|
||||
void ScannerScheduler::submit(std::shared_ptr<ScannerContext> ctx,
|
||||
std::shared_ptr<ScanTask> scan_task) {
|
||||
scan_task->last_submit_time = GetCurrentTimeNanos();
|
||||
if (ctx->done()) {
|
||||
return Status::EndOfFile("ScannerContext is done");
|
||||
return;
|
||||
}
|
||||
ctx->queue_idx = (_queue_idx++ % QUEUE_NUM);
|
||||
if (!_pending_queues[ctx->queue_idx]->blocking_put(ctx)) {
|
||||
return Status::InternalError("failed to submit scanner context to scheduler");
|
||||
auto task_lock = ctx->task_exec_ctx();
|
||||
if (task_lock == nullptr) {
|
||||
LOG(INFO) << "could not lock task execution context, query " << ctx->debug_string()
|
||||
<< " maybe finished";
|
||||
return;
|
||||
}
|
||||
|
||||
// Submit scanners to thread pool
|
||||
// TODO(cmy): How to handle this "nice"?
|
||||
int nice = 1;
|
||||
if (ctx->thread_token != nullptr) {
|
||||
std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock();
|
||||
if (scanner_delegate == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
scanner_delegate->_scanner->start_wait_worker_timer();
|
||||
auto s = ctx->thread_token->submit_func(
|
||||
[this, scanner_ref = scan_task, ctx]() { this->_scanner_scan(ctx, scanner_ref); });
|
||||
if (!s.ok()) {
|
||||
scan_task->set_status(s);
|
||||
ctx->append_block_to_queue(scan_task);
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock();
|
||||
if (scanner_delegate == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
scanner_delegate->_scanner->start_wait_worker_timer();
|
||||
TabletStorageType type = scanner_delegate->_scanner->get_storage_type();
|
||||
bool ret = false;
|
||||
if (type == TabletStorageType::STORAGE_TYPE_LOCAL) {
|
||||
if (auto* scan_sche = ctx->get_simple_scan_scheduler()) {
|
||||
auto work_func = [this, scanner_ref = scan_task, ctx]() {
|
||||
this->_scanner_scan(ctx, scanner_ref);
|
||||
};
|
||||
SimplifiedScanTask simple_scan_task = {work_func, ctx};
|
||||
ret = scan_sche->get_scan_queue()->try_put(simple_scan_task);
|
||||
} else {
|
||||
PriorityThreadPool::Task task;
|
||||
task.work_function = [this, scanner_ref = scan_task, ctx]() {
|
||||
this->_scanner_scan(ctx, scanner_ref);
|
||||
};
|
||||
task.priority = nice;
|
||||
ret = _local_scan_thread_pool->offer(task);
|
||||
}
|
||||
} else {
|
||||
PriorityThreadPool::Task task;
|
||||
task.work_function = [this, scanner_ref = scan_task, ctx]() {
|
||||
this->_scanner_scan(ctx, scanner_ref);
|
||||
};
|
||||
task.priority = nice;
|
||||
ret = _remote_scan_thread_pool->offer(task);
|
||||
}
|
||||
if (!ret) {
|
||||
scan_task->set_status(
|
||||
Status::InternalError("Failed to submit scanner to scanner pool"));
|
||||
ctx->append_block_to_queue(scan_task);
|
||||
return;
|
||||
}
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
std::unique_ptr<ThreadPoolToken> ScannerScheduler::new_limited_scan_pool_token(
|
||||
@ -155,135 +193,21 @@ std::unique_ptr<ThreadPoolToken> ScannerScheduler::new_limited_scan_pool_token(
|
||||
return _limited_scan_thread_pool->new_token(mode, max_concurrency);
|
||||
}
|
||||
|
||||
void ScannerScheduler::_schedule_thread(int queue_id) {
|
||||
BlockingQueue<std::shared_ptr<ScannerContext>>* queue = _pending_queues[queue_id];
|
||||
while (!_is_closed) {
|
||||
std::shared_ptr<ScannerContext> ctx;
|
||||
bool ok = queue->blocking_get(&ctx);
|
||||
if (!ok) {
|
||||
// maybe closed
|
||||
continue;
|
||||
}
|
||||
|
||||
_schedule_scanners(ctx);
|
||||
// If ctx is done, no need to schedule it again.
|
||||
// But should notice that there may still scanners running in scanner pool.
|
||||
}
|
||||
}
|
||||
|
||||
void ScannerScheduler::_schedule_scanners(std::shared_ptr<ScannerContext> ctx) {
|
||||
void ScannerScheduler::_scanner_scan(std::shared_ptr<ScannerContext> ctx,
|
||||
std::shared_ptr<ScanTask> scan_task) {
|
||||
// record the time from scanner submission to actual execution in nanoseconds
|
||||
ctx->incr_ctx_scheduling_time(GetCurrentTimeNanos() - scan_task->last_submit_time);
|
||||
auto task_lock = ctx->task_exec_ctx();
|
||||
if (task_lock == nullptr) {
|
||||
LOG(INFO) << "could not lock task execution context, query " << ctx->debug_string()
|
||||
<< " maybe finished";
|
||||
return;
|
||||
}
|
||||
MonotonicStopWatch watch;
|
||||
watch.reset();
|
||||
watch.start();
|
||||
ctx->incr_num_ctx_scheduling(1);
|
||||
|
||||
if (ctx->done()) {
|
||||
return;
|
||||
}
|
||||
|
||||
std::list<std::weak_ptr<ScannerDelegate>> this_run;
|
||||
ctx->get_next_batch_of_scanners(&this_run);
|
||||
if (this_run.empty()) {
|
||||
// There running scanner will schedule the ctx after they are finished.
|
||||
// So here we just return to stop scheduling ctx.
|
||||
return;
|
||||
}
|
||||
|
||||
ctx->inc_num_running_scanners(this_run.size());
|
||||
|
||||
// Submit scanners to thread pool
|
||||
// TODO(cmy): How to handle this "nice"?
|
||||
int nice = 1;
|
||||
auto iter = this_run.begin();
|
||||
if (ctx->thread_token != nullptr) {
|
||||
// TODO llj tg how to treat this?
|
||||
while (iter != this_run.end()) {
|
||||
std::shared_ptr<ScannerDelegate> scanner_delegate = (*iter).lock();
|
||||
if (scanner_delegate == nullptr) {
|
||||
// Has to ++, or there is a dead loop
|
||||
iter++;
|
||||
continue;
|
||||
}
|
||||
scanner_delegate->_scanner->start_wait_worker_timer();
|
||||
auto s = ctx->thread_token->submit_func([this, scanner_ref = *iter, ctx]() {
|
||||
this->_scanner_scan(this, ctx, scanner_ref);
|
||||
});
|
||||
if (s.ok()) {
|
||||
iter++;
|
||||
} else {
|
||||
ctx->set_status_on_error(s);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while (iter != this_run.end()) {
|
||||
std::shared_ptr<ScannerDelegate> scanner_delegate = (*iter).lock();
|
||||
if (scanner_delegate == nullptr) {
|
||||
// Has to ++, or there is a dead loop
|
||||
iter++;
|
||||
continue;
|
||||
}
|
||||
scanner_delegate->_scanner->start_wait_worker_timer();
|
||||
TabletStorageType type = scanner_delegate->_scanner->get_storage_type();
|
||||
bool ret = false;
|
||||
if (type == TabletStorageType::STORAGE_TYPE_LOCAL) {
|
||||
if (auto* scan_sche = ctx->get_simple_scan_scheduler()) {
|
||||
auto work_func = [this, scanner_ref = *iter, ctx]() {
|
||||
this->_scanner_scan(this, ctx, scanner_ref);
|
||||
};
|
||||
SimplifiedScanTask simple_scan_task = {work_func, ctx};
|
||||
ret = scan_sche->get_scan_queue()->try_put(simple_scan_task);
|
||||
} else {
|
||||
PriorityThreadPool::Task task;
|
||||
task.work_function = [this, scanner_ref = *iter, ctx]() {
|
||||
this->_scanner_scan(this, ctx, scanner_ref);
|
||||
};
|
||||
task.priority = nice;
|
||||
ret = _local_scan_thread_pool->offer(task);
|
||||
}
|
||||
} else {
|
||||
PriorityThreadPool::Task task;
|
||||
task.work_function = [this, scanner_ref = *iter, ctx]() {
|
||||
this->_scanner_scan(this, ctx, scanner_ref);
|
||||
};
|
||||
task.priority = nice;
|
||||
ret = _remote_scan_thread_pool->offer(task);
|
||||
}
|
||||
if (ret) {
|
||||
iter++;
|
||||
} else {
|
||||
ctx->set_status_on_error(
|
||||
Status::InternalError("failed to submit scanner to scanner pool"));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
ctx->incr_ctx_scheduling_time(watch.elapsed_time());
|
||||
}
|
||||
|
||||
void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler,
|
||||
std::shared_ptr<ScannerContext> ctx,
|
||||
std::weak_ptr<ScannerDelegate> scanner_ref) {
|
||||
auto task_lock = ctx->task_exec_ctx();
|
||||
if (task_lock == nullptr) {
|
||||
// LOG(WARNING) << "could not lock task execution context, query " << print_id(_query_id)
|
||||
// << " maybe finished";
|
||||
return;
|
||||
}
|
||||
//LOG_EVERY_N(INFO, 100) << "start running scanner from ctx " << ctx->debug_string();
|
||||
// will release scanner if it is the last one, task lock is hold here, to ensure
|
||||
// that scanner could call scannode's method during deconstructor
|
||||
std::shared_ptr<ScannerDelegate> scanner_delegate = scanner_ref.lock();
|
||||
auto& scanner = scanner_delegate->_scanner;
|
||||
std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock();
|
||||
if (scanner_delegate == nullptr) {
|
||||
return;
|
||||
}
|
||||
|
||||
VScannerSPtr& scanner = scanner_delegate->_scanner;
|
||||
SCOPED_ATTACH_TASK(scanner->runtime_state());
|
||||
// for cpu hard limit, thread name should not be reset
|
||||
if (ctx->_should_reset_thread_name) {
|
||||
@ -306,14 +230,13 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler,
|
||||
if (!scanner->is_init()) {
|
||||
status = scanner->init();
|
||||
if (!status.ok()) {
|
||||
ctx->set_status_on_error(status);
|
||||
eos = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (!eos && !scanner->is_open()) {
|
||||
status = scanner->open(state);
|
||||
if (!status.ok()) {
|
||||
ctx->set_status_on_error(status);
|
||||
eos = true;
|
||||
}
|
||||
scanner->set_opened();
|
||||
@ -321,46 +244,21 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler,
|
||||
|
||||
static_cast<void>(scanner->try_append_late_arrival_runtime_filter());
|
||||
|
||||
// Because we use thread pool to scan data from storage. One scanner can't
|
||||
// use this thread too long, this can starve other query's scanner. So, we
|
||||
// need yield this thread when we do enough work. However, OlapStorage read
|
||||
// data in pre-aggregate mode, then we can't use storage returned data to
|
||||
// judge if we need to yield. So we record all raw data read in this round
|
||||
// scan, if this exceeds row number or bytes threshold, we yield this thread.
|
||||
std::vector<vectorized::BlockUPtr> blocks;
|
||||
int64_t raw_bytes_read = 0;
|
||||
int64_t raw_bytes_threshold = config::doris_scanner_row_bytes;
|
||||
int num_rows_in_block = 0;
|
||||
|
||||
// Only set to true when ctx->done() return true.
|
||||
// Use this flag because we need distinguish eos from `should_stop`.
|
||||
// If eos is true, we still need to return blocks,
|
||||
// but is should_stop is true, no need to return blocks
|
||||
bool should_stop = false;
|
||||
// Has to wait at least one full block, or it will cause a lot of schedule task in priority
|
||||
// queue, it will affect query latency and query concurrency for example ssb 3.3.
|
||||
auto should_do_scan = [&, batch_size = state->batch_size(),
|
||||
time = state->wait_full_block_schedule_times()]() {
|
||||
if (raw_bytes_read < raw_bytes_threshold) {
|
||||
return true;
|
||||
} else if (num_rows_in_block < batch_size) {
|
||||
return raw_bytes_read < raw_bytes_threshold * time;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
while (!eos && should_do_scan()) {
|
||||
// TODO llj task group should should_yield?
|
||||
bool first_read = true;
|
||||
while (!eos) {
|
||||
if (UNLIKELY(ctx->done())) {
|
||||
// No need to set status on error here.
|
||||
// Because done() maybe caused by "should_stop"
|
||||
should_stop = true;
|
||||
eos = true;
|
||||
break;
|
||||
}
|
||||
BlockUPtr free_block = nullptr;
|
||||
if (first_read) {
|
||||
status = scanner->get_block_after_projects(state, scan_task->current_block.get(), &eos);
|
||||
first_read = false;
|
||||
} else {
|
||||
free_block = ctx->get_free_block();
|
||||
status = scanner->get_block_after_projects(state, free_block.get(), &eos);
|
||||
}
|
||||
|
||||
BlockUPtr block = ctx->get_free_block();
|
||||
|
||||
status = scanner->get_block_after_projects(state, block.get(), &eos);
|
||||
// The VFileScanner for external table may try to open not exist files,
|
||||
// Because FE file cache for external table may out of date.
|
||||
// So, NOT_FOUND for VFileScanner is not a fail case.
|
||||
@ -370,49 +268,36 @@ void ScannerScheduler::_scanner_scan(ScannerScheduler* scheduler,
|
||||
!status.is<ErrorCode::NOT_FOUND>()))) {
|
||||
LOG(WARNING) << "Scan thread read VScanner failed: " << status.to_string();
|
||||
break;
|
||||
}
|
||||
VLOG_ROW << "VScanNode input rows: " << block->rows() << ", eos: " << eos;
|
||||
if (status.is<ErrorCode::NOT_FOUND>()) {
|
||||
} else if (status.is<ErrorCode::NOT_FOUND>()) {
|
||||
// The only case in this "if" branch is external table file delete and fe cache has not been updated yet.
|
||||
// Set status to OK.
|
||||
status = Status::OK();
|
||||
eos = true;
|
||||
break;
|
||||
}
|
||||
|
||||
raw_bytes_read += block->allocated_bytes();
|
||||
num_rows_in_block += block->rows();
|
||||
if (UNLIKELY(block->rows() == 0)) {
|
||||
ctx->return_free_block(std::move(block));
|
||||
} else {
|
||||
if (!blocks.empty() && blocks.back()->rows() + block->rows() <= state->batch_size()) {
|
||||
vectorized::MutableBlock mutable_block(blocks.back().get());
|
||||
static_cast<void>(mutable_block.merge(*block));
|
||||
blocks.back().get()->set_columns(std::move(mutable_block.mutable_columns()));
|
||||
ctx->return_free_block(std::move(block));
|
||||
} else {
|
||||
blocks.push_back(std::move(block));
|
||||
}
|
||||
if (!first_read && free_block) {
|
||||
vectorized::MutableBlock mutable_block(scan_task->current_block.get());
|
||||
static_cast<void>(mutable_block.merge(*free_block));
|
||||
scan_task->current_block->set_columns(std::move(mutable_block.mutable_columns()));
|
||||
ctx->return_free_block(std::move(free_block));
|
||||
}
|
||||
if (scan_task->current_block->rows() >= ctx->batch_size()) {
|
||||
break;
|
||||
}
|
||||
} // end for while
|
||||
|
||||
// if we failed, check status.
|
||||
if (UNLIKELY(!status.ok())) {
|
||||
// _transfer_done = true;
|
||||
ctx->set_status_on_error(status);
|
||||
scan_task->set_status(status);
|
||||
eos = true;
|
||||
blocks.clear();
|
||||
} else if (should_stop) {
|
||||
// No need to return blocks because of should_stop, just delete them
|
||||
blocks.clear();
|
||||
} else if (!blocks.empty()) {
|
||||
ctx->append_blocks_to_queue(blocks);
|
||||
}
|
||||
|
||||
scanner->update_scan_cpu_timer();
|
||||
if (eos || should_stop) {
|
||||
if (eos) {
|
||||
scanner->mark_to_need_to_close();
|
||||
}
|
||||
ctx->push_back_scanner_and_reschedule(scanner_delegate);
|
||||
scan_task->set_eos(eos);
|
||||
ctx->append_block_to_queue(scan_task);
|
||||
}
|
||||
|
||||
void ScannerScheduler::_register_metrics() {
|
||||
|
||||
@ -37,25 +37,18 @@ class BlockingQueue;
|
||||
|
||||
namespace doris::vectorized {
|
||||
class ScannerDelegate;
|
||||
class ScanTask;
|
||||
class ScannerContext;
|
||||
|
||||
// Responsible for the scheduling and execution of all Scanners of a BE node.
|
||||
// ScannerScheduler has two types of thread pools:
|
||||
// 1. Scheduling thread pool
|
||||
// Responsible for Scanner scheduling.
|
||||
// A set of Scanners for a query will be encapsulated into a ScannerContext
|
||||
// and submitted to the ScannerScheduler's scheduling queue.
|
||||
// There are multiple scheduling queues in ScannerScheduler, and each scheduling queue
|
||||
// is handled by a scheduling thread.
|
||||
// The scheduling thread is scheduled in granularity of ScannerContext,
|
||||
// that is, a group of Scanners in a ScannerContext are scheduled at a time.
|
||||
//
|
||||
//2. Execution thread pool
|
||||
// The scheduling thread will submit the Scanners selected from the ScannerContext
|
||||
// Execution thread pool
|
||||
// When a ScannerContext is launched, it will submit the running scanners to this scheduler.
|
||||
// The scheduling thread will submit the running scanner and its ScannerContext
|
||||
// to the execution thread pool to do the actual scan task.
|
||||
// Each Scanner will act as a producer, read a group of blocks and put them into
|
||||
// Each Scanner will act as a producer, read the next block and put it into
|
||||
// the corresponding block queue.
|
||||
// The corresponding ScanNode will act as a consumer to consume blocks from the block queue.
|
||||
// After the block is consumed, the unfinished scanner will resubmit to this scheduler.
|
||||
class ScannerScheduler {
|
||||
public:
|
||||
ScannerScheduler();
|
||||
@ -63,7 +56,7 @@ public:
|
||||
|
||||
[[nodiscard]] Status init(ExecEnv* env);
|
||||
|
||||
[[nodiscard]] Status submit(std::shared_ptr<ScannerContext> ctx);
|
||||
void submit(std::shared_ptr<ScannerContext> ctx, std::shared_ptr<ScanTask> scan_task);
|
||||
|
||||
void stop();
|
||||
|
||||
@ -73,32 +66,13 @@ public:
|
||||
int remote_thread_pool_max_size() const { return _remote_thread_pool_max_size; }
|
||||
|
||||
private:
|
||||
// scheduling thread function
|
||||
void _schedule_thread(int queue_id);
|
||||
// schedule scanners in a certain ScannerContext
|
||||
void _schedule_scanners(std::shared_ptr<ScannerContext> ctx);
|
||||
// execution thread function
|
||||
void _scanner_scan(ScannerScheduler* scheduler, std::shared_ptr<ScannerContext> ctx,
|
||||
std::weak_ptr<ScannerDelegate> scanner);
|
||||
static void _scanner_scan(std::shared_ptr<ScannerContext> ctx,
|
||||
std::shared_ptr<ScanTask> scan_task);
|
||||
|
||||
void _register_metrics();
|
||||
|
||||
static void _deregister_metrics();
|
||||
|
||||
// Scheduling queue number.
|
||||
// TODO: make it configurable.
|
||||
static const int QUEUE_NUM = 4;
|
||||
// The ScannerContext will be submitted to the pending queue roundrobin.
|
||||
// _queue_idx pointer to the current queue.
|
||||
// Use std::atomic_uint to prevent numerical overflow from memory out of bound.
|
||||
// The scheduler thread will take ctx from pending queue, schedule it,
|
||||
// and put it to the _scheduling_map.
|
||||
// If any scanner finish, it will take ctx from and put it to pending queue again.
|
||||
std::atomic_uint _queue_idx = {0};
|
||||
BlockingQueue<std::shared_ptr<ScannerContext>>** _pending_queues = nullptr;
|
||||
|
||||
// scheduling thread pool
|
||||
std::unique_ptr<ThreadPool> _scheduler_pool;
|
||||
// execution thread pool
|
||||
// _local_scan_thread_pool is for local scan task(typically, olap scanner)
|
||||
// _remote_scan_thread_pool is for remote scan task(cold data on s3, hdfs, etc.)
|
||||
|
||||
@ -197,7 +197,6 @@ Status VScanNode::alloc_resource(RuntimeState* state) {
|
||||
if (_scanner_ctx) {
|
||||
DCHECK(!_eos && _num_scanners->value() > 0);
|
||||
RETURN_IF_ERROR(_scanner_ctx->init());
|
||||
RETURN_IF_ERROR(_state->exec_env()->scanner_scheduler()->submit(_scanner_ctx));
|
||||
}
|
||||
if (_shared_scan_opt) {
|
||||
//LOG(INFO) << "instance shared scan enabled"
|
||||
@ -219,7 +218,6 @@ Status VScanNode::alloc_resource(RuntimeState* state) {
|
||||
: Status::OK());
|
||||
if (_scanner_ctx) {
|
||||
RETURN_IF_ERROR(_scanner_ctx->init());
|
||||
RETURN_IF_ERROR(_state->exec_env()->scanner_scheduler()->submit(_scanner_ctx));
|
||||
}
|
||||
}
|
||||
|
||||
@ -246,14 +244,10 @@ Status VScanNode::get_next(RuntimeState* state, vectorized::Block* block, bool*
|
||||
}};
|
||||
|
||||
if (state->is_cancelled()) {
|
||||
// ISSUE: https://github.com/apache/doris/issues/16360
|
||||
// _scanner_ctx may be null here, see: `VScanNode::alloc_resource` (_eos == null)
|
||||
if (_scanner_ctx) {
|
||||
_scanner_ctx->set_status_on_error(Status::Cancelled("query cancelled"));
|
||||
return _scanner_ctx->status();
|
||||
} else {
|
||||
return Status::Cancelled("query cancelled");
|
||||
_scanner_ctx->stop_scanners(state);
|
||||
}
|
||||
return Status::Cancelled("Query cancelled in ScanNode");
|
||||
}
|
||||
|
||||
if (_eos) {
|
||||
@ -261,16 +255,7 @@ Status VScanNode::get_next(RuntimeState* state, vectorized::Block* block, bool*
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
vectorized::BlockUPtr scan_block = nullptr;
|
||||
RETURN_IF_ERROR(_scanner_ctx->get_block_from_queue(state, &scan_block, eos, _context_queue_id));
|
||||
if (*eos) {
|
||||
DCHECK(scan_block == nullptr);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// get scanner's block memory
|
||||
block->swap(*scan_block);
|
||||
_scanner_ctx->return_free_block(std::move(scan_block));
|
||||
RETURN_IF_ERROR(_scanner_ctx->get_block_from_queue(state, block, eos, _context_queue_id));
|
||||
|
||||
reached_limit(block, eos);
|
||||
if (*eos) {
|
||||
@ -294,16 +279,14 @@ Status VScanNode::_init_profile() {
|
||||
runtime_profile()->add_child(_scanner_profile.get(), true, nullptr);
|
||||
|
||||
_memory_usage_counter = ADD_LABEL_COUNTER(_scanner_profile, "MemoryUsage");
|
||||
_queued_blocks_memory_usage =
|
||||
_scanner_profile->AddHighWaterMarkCounter("QueuedBlocks", TUnit::BYTES, "MemoryUsage");
|
||||
_free_blocks_memory_usage =
|
||||
_scanner_profile->AddHighWaterMarkCounter("FreeBlocks", TUnit::BYTES, "MemoryUsage");
|
||||
_newly_create_free_blocks_num =
|
||||
ADD_COUNTER(_scanner_profile, "NewlyCreateFreeBlocksNum", TUnit::UNIT);
|
||||
_scale_up_scanners_counter = ADD_COUNTER(_scanner_profile, "NumScaleUpScanners", TUnit::UNIT);
|
||||
// time of transfer thread to wait for block from scan thread
|
||||
_scanner_wait_batch_timer = ADD_TIMER(_scanner_profile, "ScannerBatchWaitTime");
|
||||
_scanner_sched_counter = ADD_COUNTER(_scanner_profile, "ScannerSchedCount", TUnit::UNIT);
|
||||
_scanner_ctx_sched_counter = ADD_COUNTER(_scanner_profile, "ScannerCtxSchedCount", TUnit::UNIT);
|
||||
_scanner_ctx_sched_time = ADD_TIMER(_scanner_profile, "ScannerCtxSchedTime");
|
||||
|
||||
_scan_timer = ADD_TIMER(_scanner_profile, "ScannerGetBlockTime");
|
||||
|
||||
@ -377,7 +377,6 @@ protected:
|
||||
RuntimeProfile::Counter* _filter_timer = nullptr;
|
||||
|
||||
RuntimeProfile::Counter* _scanner_sched_counter = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_ctx_sched_counter = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_ctx_sched_time = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_wait_batch_timer = nullptr;
|
||||
RuntimeProfile::Counter* _scanner_wait_worker_timer = nullptr;
|
||||
@ -387,8 +386,8 @@ protected:
|
||||
RuntimeProfile::Counter* _max_scanner_thread_num = nullptr;
|
||||
|
||||
RuntimeProfile::Counter* _memory_usage_counter = nullptr;
|
||||
RuntimeProfile::HighWaterMarkCounter* _queued_blocks_memory_usage = nullptr;
|
||||
RuntimeProfile::HighWaterMarkCounter* _free_blocks_memory_usage = nullptr;
|
||||
RuntimeProfile::Counter* _scale_up_scanners_counter = nullptr;
|
||||
|
||||
std::unordered_map<std::string, int> _colname_to_slot_id;
|
||||
|
||||
|
||||
@ -74,6 +74,8 @@ public class SessionVariable implements Serializable, Writable {
|
||||
|
||||
public static final String EXEC_MEM_LIMIT = "exec_mem_limit";
|
||||
public static final String SCAN_QUEUE_MEM_LIMIT = "scan_queue_mem_limit";
|
||||
public static final String NUM_SCANNER_THREADS = "num_scanner_threads";
|
||||
public static final String SCANNER_SCALE_UP_RATIO = "scanner_scale_up_ratio";
|
||||
public static final String QUERY_TIMEOUT = "query_timeout";
|
||||
public static final String ANALYZE_TIMEOUT = "analyze_timeout";
|
||||
|
||||
@ -553,6 +555,20 @@ public class SessionVariable implements Serializable, Writable {
|
||||
@VariableMgr.VarAttr(name = SCAN_QUEUE_MEM_LIMIT)
|
||||
public long maxScanQueueMemByte = 2147483648L / 20;
|
||||
|
||||
@VariableMgr.VarAttr(name = NUM_SCANNER_THREADS, needForward = true, description = {
|
||||
"ScanNode扫描数据的最大并发,默认为0,采用BE的doris_scanner_thread_pool_thread_num",
|
||||
"The max threads to read data of ScanNode, "
|
||||
+ "default 0, use doris_scanner_thread_pool_thread_num in be.conf"
|
||||
})
|
||||
public int numScannerThreads = 0;
|
||||
|
||||
@VariableMgr.VarAttr(name = SCANNER_SCALE_UP_RATIO, needForward = true, description = {
|
||||
"ScanNode自适应的增加扫描并发,最大允许增长的并发倍率,默认为0,关闭该功能",
|
||||
"The max multiple of increasing the concurrency of scanners adaptively, "
|
||||
+ "default 0, turn off scaling up"
|
||||
})
|
||||
public double scannerScaleUpRatio = 0;
|
||||
|
||||
@VariableMgr.VarAttr(name = ENABLE_SPILLING)
|
||||
public boolean enableSpilling = false;
|
||||
|
||||
@ -1790,6 +1806,14 @@ public class SessionVariable implements Serializable, Writable {
|
||||
return maxScanQueueMemByte;
|
||||
}
|
||||
|
||||
public int getNumScannerThreads() {
|
||||
return numScannerThreads;
|
||||
}
|
||||
|
||||
public double getScannerScaleUpRatio() {
|
||||
return scannerScaleUpRatio;
|
||||
}
|
||||
|
||||
public int getQueryTimeoutS() {
|
||||
return queryTimeoutS;
|
||||
}
|
||||
@ -1962,7 +1986,15 @@ public class SessionVariable implements Serializable, Writable {
|
||||
}
|
||||
|
||||
public void setMaxScanQueueMemByte(long scanQueueMemByte) {
|
||||
this.maxScanQueueMemByte = Math.min(scanQueueMemByte, maxExecMemByte / 2);
|
||||
this.maxScanQueueMemByte = scanQueueMemByte;
|
||||
}
|
||||
|
||||
public void setNumScannerThreads(int numScannerThreads) {
|
||||
this.numScannerThreads = numScannerThreads;
|
||||
}
|
||||
|
||||
public void setScannerScaleUpRatio(double scannerScaleUpRatio) {
|
||||
this.scannerScaleUpRatio = scannerScaleUpRatio;
|
||||
}
|
||||
|
||||
public boolean isSqlQuoteShowCreate() {
|
||||
@ -2771,7 +2803,9 @@ public class SessionVariable implements Serializable, Writable {
|
||||
public TQueryOptions toThrift() {
|
||||
TQueryOptions tResult = new TQueryOptions();
|
||||
tResult.setMemLimit(maxExecMemByte);
|
||||
tResult.setScanQueueMemLimit(Math.min(maxScanQueueMemByte, maxExecMemByte / 20));
|
||||
tResult.setScanQueueMemLimit(maxScanQueueMemByte);
|
||||
tResult.setNumScannerThreads(numScannerThreads);
|
||||
tResult.setScannerScaleUpRatio(scannerScaleUpRatio);
|
||||
|
||||
// TODO chenhao, reservation will be calculated by cost
|
||||
tResult.setMinReservation(0);
|
||||
@ -3067,7 +3101,9 @@ public class SessionVariable implements Serializable, Writable {
|
||||
public TQueryOptions getQueryOptionVariables() {
|
||||
TQueryOptions queryOptions = new TQueryOptions();
|
||||
queryOptions.setMemLimit(maxExecMemByte);
|
||||
queryOptions.setScanQueueMemLimit(Math.min(maxScanQueueMemByte, maxExecMemByte / 20));
|
||||
queryOptions.setScanQueueMemLimit(maxScanQueueMemByte);
|
||||
queryOptions.setNumScannerThreads(numScannerThreads);
|
||||
queryOptions.setScannerScaleUpRatio(scannerScaleUpRatio);
|
||||
queryOptions.setQueryTimeout(queryTimeoutS);
|
||||
queryOptions.setInsertTimeout(insertTimeoutS);
|
||||
queryOptions.setAnalyzeTimeout(analyzeTimeoutS);
|
||||
|
||||
@ -271,6 +271,8 @@ struct TQueryOptions {
|
||||
97: optional i64 parallel_scan_min_rows_per_scanner = 0;
|
||||
|
||||
98: optional bool skip_bad_tablet = false;
|
||||
// Increase concurrency of scanners adaptively, the maxinum times to scale up
|
||||
99: optional double scanner_scale_up_ratio = 0;
|
||||
|
||||
// For cloud, to control if the content would be written into file cache
|
||||
1000: optional bool disable_file_cache = false
|
||||
|
||||
Reference in New Issue
Block a user