[refactor](scan-pool) move scan pool from env to scanner scheduler (#15604)

The original scan pools are in exec_env.
But after enabling new_load_scan_node by default, the scan pools in exec_env are no longer used.
All scan tasks will be submitted to the scan pools in scanner_scheduler.

BTW, reorganize the scan pool into 3 kinds:

local scan pool
For olap scan node

remote scan pool
For file scan node

limited scan pool
For queries that set a CPU resource limit or have a small LIMIT clause

TODO:
Use bthread to unify all IO tasks.

Some trivial issues:

Fix a bug where the memtable flush size printed in the log was incorrect
Add a RuntimeProfile parameter to VScanner
This commit is contained in:
Mingyu Chen
2023-01-11 09:38:42 +08:00
committed by GitHub
parent d857b4af1b
commit 3fec5ff0f5
27 changed files with 72 additions and 90 deletions

View File

@ -356,8 +356,7 @@ Status MemTable::_do_flush(int64_t& duration_ns) {
SCOPED_RAW_TIMER(&duration_ns);
_collect_vskiplist_results<true>();
vectorized::Block block = _output_mutable_block.to_block();
RETURN_NOT_OK(_rowset_writer->flush_single_memtable(&block));
_flush_size = block.allocated_bytes();
RETURN_NOT_OK(_rowset_writer->flush_single_memtable(&block, &_flush_size));
return Status::OK();
}

View File

@ -88,11 +88,12 @@ void FlushToken::_flush_memtable(MemTable* memtable, int64_t submit_task_time) {
MonotonicStopWatch timer;
timer.start();
size_t memory_usage = memtable->memory_usage();
Status s = memtable->flush();
if (!s) {
LOG(WARNING) << "Flush memtable failed with res = " << s;
// If s is not ok, ignore the code, just use other code is ok
_flush_status.store(ErrorCode::INTERNAL_ERROR);
_flush_status.store(s.code());
}
if (_flush_status.load() != OK) {
return;
@ -101,8 +102,7 @@ void FlushToken::_flush_memtable(MemTable* memtable, int64_t submit_task_time) {
VLOG_CRITICAL << "flush memtable cost: " << timer.elapsed_time()
<< ", running count: " << _stats.flush_running_count
<< ", finish count: " << _stats.flush_finish_count
<< ", mem size: " << memtable->memory_usage()
<< ", disk size: " << memtable->flush_size();
<< ", mem size: " << memory_usage << ", disk size: " << memtable->flush_size();
_stats.flush_time_ns += timer.elapsed_time();
_stats.flush_finish_count++;
_stats.flush_running_count--;

View File

@ -637,7 +637,7 @@ Status BetaRowsetWriter::flush() {
return Status::OK();
}
Status BetaRowsetWriter::flush_single_memtable(const vectorized::Block* block) {
Status BetaRowsetWriter::flush_single_memtable(const vectorized::Block* block, int64* flush_size) {
if (block->rows() == 0) {
return Status::OK();
}
@ -645,7 +645,7 @@ Status BetaRowsetWriter::flush_single_memtable(const vectorized::Block* block) {
std::unique_ptr<segment_v2::SegmentWriter> writer;
RETURN_NOT_OK(_create_segment_writer(&writer));
RETURN_NOT_OK(_add_block(block, &writer));
RETURN_NOT_OK(_flush_segment_writer(&writer));
RETURN_NOT_OK(_flush_segment_writer(&writer, flush_size));
return Status::OK();
}
@ -891,12 +891,12 @@ Status BetaRowsetWriter::_create_segment_writer(
std::unique_ptr<segment_v2::SegmentWriter>* writer) {
size_t total_segment_num = _num_segment - _segcompacted_point + 1 + _num_segcompacted;
if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) {
LOG(ERROR) << "too many segments in rowset."
<< " tablet_id:" << _context.tablet_id << " rowset_id:" << _context.rowset_id
<< " max:" << config::max_segment_num_per_rowset
<< " _num_segment:" << _num_segment
<< " _segcompacted_point:" << _segcompacted_point
<< " _num_segcompacted:" << _num_segcompacted;
LOG(WARNING) << "too many segments in rowset."
<< " tablet_id:" << _context.tablet_id << " rowset_id:" << _context.rowset_id
<< " max:" << config::max_segment_num_per_rowset
<< " _num_segment:" << _num_segment
<< " _segcompacted_point:" << _segcompacted_point
<< " _num_segcompacted:" << _num_segcompacted;
return Status::Error<TOO_MANY_SEGMENTS>();
} else {
return _do_create_segment_writer(writer, false, -1, -1);

View File

@ -56,7 +56,7 @@ public:
// Return the file size flushed to disk in "flush_size"
// This method is thread-safe.
Status flush_single_memtable(const vectorized::Block* block) override;
Status flush_single_memtable(const vectorized::Block* block, int64_t* flush_size) override;
RowsetSharedPtr build() override;

View File

@ -64,7 +64,7 @@ public:
}
virtual Status final_flush() { return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>(); }
virtual Status flush_single_memtable(const vectorized::Block* block) {
virtual Status flush_single_memtable(const vectorized::Block* block, int64_t* flush_size) {
return Status::Error<ErrorCode::NOT_IMPLEMENTED_ERROR>();
}

View File

@ -130,9 +130,6 @@ public:
std::shared_ptr<MemTrackerLimiter> orphan_mem_tracker() { return _orphan_mem_tracker; }
MemTrackerLimiter* orphan_mem_tracker_raw() { return _orphan_mem_tracker_raw; }
ThreadResourceMgr* thread_mgr() { return _thread_mgr; }
PriorityThreadPool* scan_thread_pool() { return _scan_thread_pool; }
PriorityThreadPool* remote_scan_thread_pool() { return _remote_scan_thread_pool; }
ThreadPool* limited_scan_thread_pool() { return _limited_scan_thread_pool.get(); }
ThreadPool* send_batch_thread_pool() { return _send_batch_thread_pool.get(); }
ThreadPool* download_cache_thread_pool() { return _download_cache_thread_pool.get(); }
void set_serial_download_cache_thread_token() {
@ -226,20 +223,6 @@ private:
std::shared_ptr<MemTrackerLimiter> _orphan_mem_tracker;
MemTrackerLimiter* _orphan_mem_tracker_raw;
// The following two thread pools are used in different scenarios.
// _scan_thread_pool is a priority thread pool.
// Scanner threads for common queries will use this thread pool,
// and the priority of each scan task is set according to the size of the query.
// _limited_scan_thread_pool is also the thread pool used for scanner.
// The difference is that it is no longer a priority queue, but according to the concurrency
// set by the user to control the number of threads that can be used by a query.
// TODO(cmy): find a better way to unify these 2 pools.
PriorityThreadPool* _scan_thread_pool = nullptr;
PriorityThreadPool* _remote_scan_thread_pool = nullptr;
std::unique_ptr<ThreadPool> _limited_scan_thread_pool;
std::unique_ptr<ThreadPool> _send_batch_thread_pool;
// Threadpool used to download cache from remote storage

View File

@ -97,29 +97,6 @@ Status ExecEnv::_init(const std::vector<StorePath>& store_paths) {
_frontend_client_cache = new FrontendServiceClientCache(config::max_client_cache_size_per_host);
_broker_client_cache = new BrokerServiceClientCache(config::max_client_cache_size_per_host);
_thread_mgr = new ThreadResourceMgr();
if (config::doris_enable_scanner_thread_pool_per_disk &&
config::doris_scanner_thread_pool_thread_num >= store_paths.size() &&
store_paths.size() > 0) {
_scan_thread_pool = new PriorityWorkStealingThreadPool(
config::doris_scanner_thread_pool_thread_num, store_paths.size(),
config::doris_scanner_thread_pool_queue_size, "olap_scanner");
LOG(INFO) << "scan thread pool use PriorityWorkStealingThreadPool";
} else {
_scan_thread_pool = new PriorityThreadPool(config::doris_scanner_thread_pool_thread_num,
config::doris_scanner_thread_pool_queue_size,
"olap_scanner");
LOG(INFO) << "scan thread pool use PriorityThreadPool";
}
_remote_scan_thread_pool = new PriorityThreadPool(
config::doris_remote_scanner_thread_pool_thread_num,
config::doris_remote_scanner_thread_pool_queue_size, "remote_scan");
ThreadPoolBuilder("LimitedScanThreadPool")
.set_min_threads(config::doris_scanner_thread_pool_thread_num)
.set_max_threads(config::doris_scanner_thread_pool_thread_num)
.set_max_queue_size(config::doris_scanner_thread_pool_queue_size)
.build(&_limited_scan_thread_pool);
ThreadPoolBuilder("SendBatchThreadPool")
.set_min_threads(config::send_batch_thread_pool_thread_num)
@ -324,9 +301,6 @@ void ExecEnv::init_download_cache_required_components() {
}
void ExecEnv::_register_metrics() {
REGISTER_HOOK_METRIC(scanner_thread_pool_queue_size,
[this]() { return _scan_thread_pool->get_queue_size(); });
REGISTER_HOOK_METRIC(send_batch_thread_pool_thread_num,
[this]() { return _send_batch_thread_pool->num_threads(); });
@ -367,8 +341,6 @@ void ExecEnv::_destroy() {
SAFE_DELETE(_fragment_mgr);
SAFE_DELETE(_pipeline_task_scheduler);
SAFE_DELETE(_cgroups_mgr);
SAFE_DELETE(_scan_thread_pool);
SAFE_DELETE(_remote_scan_thread_pool);
SAFE_DELETE(_thread_mgr);
SAFE_DELETE(_broker_client_cache);
SAFE_DELETE(_frontend_client_cache);

View File

@ -794,11 +794,15 @@ void FragmentMgr::_set_scan_concurrency(const TExecPlanFragmentParams& params,
// the thread token will be set if
// 1. the cpu_limit is set, or
// 2. the limit is very small ( < 1024)
// If the token is set, the scan task will use limited_scan_pool in scanner scheduler.
// Otherwise, the scan task will use local/remote scan pool in scanner scheduler
int concurrency = 1;
bool is_serial = false;
bool need_token = false;
if (params.query_options.__isset.resource_limit &&
params.query_options.resource_limit.__isset.cpu_limit) {
concurrency = params.query_options.resource_limit.cpu_limit;
need_token = true;
} else {
concurrency = config::doris_scanner_thread_pool_thread_num;
}
@ -816,11 +820,14 @@ void FragmentMgr::_set_scan_concurrency(const TExecPlanFragmentParams& params,
if (node.limit > 0 && node.limit < 1024) {
concurrency = 1;
is_serial = true;
need_token = true;
break;
}
}
}
fragments_ctx->set_thread_token(concurrency, is_serial);
if (need_token) {
fragments_ctx->set_thread_token(concurrency, is_serial);
}
#endif
}

View File

@ -31,6 +31,7 @@
#include "runtime/runtime_predicate.h"
#include "util/pretty_printer.h"
#include "util/threadpool.h"
#include "vec/exec/scan/scanner_scheduler.h"
#include "vec/runtime/shared_hash_table_controller.h"
namespace doris {
@ -77,7 +78,7 @@ public:
}
void set_thread_token(int concurrency, bool is_serial) {
_thread_token = _exec_env->limited_scan_thread_pool()->new_token(
_thread_token = _exec_env->scanner_scheduler()->new_limited_scan_pool_token(
is_serial ? ThreadPool::ExecutionMode::SERIAL
: ThreadPool::ExecutionMode::CONCURRENT,
concurrency);

View File

@ -148,8 +148,9 @@ Status NewEsScanNode::_init_scanners(std::list<VScanner*>* scanners) {
properties[ESScanReader::KEY_QUERY] = ESScrollQueryBuilder::build(
properties, _column_names, _docvalue_context, &doc_value_mode);
NewEsScanner* scanner = new NewEsScanner(_state, this, _limit_per_scanner, _tuple_id,
properties, _docvalue_context, doc_value_mode);
NewEsScanner* scanner =
new NewEsScanner(_state, this, _limit_per_scanner, _tuple_id, properties,
_docvalue_context, doc_value_mode, _state->runtime_profile());
_scanner_pool.add(scanner);
RETURN_IF_ERROR(scanner->prepare(_state, _vconjunct_ctx_ptr.get()));

View File

@ -26,8 +26,8 @@ namespace doris::vectorized {
NewEsScanner::NewEsScanner(RuntimeState* state, NewEsScanNode* parent, int64_t limit,
TupleId tuple_id, const std::map<std::string, std::string>& properties,
const std::map<std::string, std::string>& docvalue_context,
bool doc_value_mode)
: VScanner(state, static_cast<VScanNode*>(parent), limit),
bool doc_value_mode, RuntimeProfile* profile)
: VScanner(state, static_cast<VScanNode*>(parent), limit, profile),
_is_init(false),
_es_eof(false),
_properties(properties),

View File

@ -30,7 +30,8 @@ class NewEsScanner : public VScanner {
public:
NewEsScanner(RuntimeState* state, NewEsScanNode* parent, int64_t limit, TupleId tuple_id,
const std::map<std::string, std::string>& properties,
const std::map<std::string, std::string>& docvalue_context, bool doc_value_mode);
const std::map<std::string, std::string>& docvalue_context, bool doc_value_mode,
RuntimeProfile* profile);
Status open(RuntimeState* state) override;
Status close(RuntimeState* state) override;

View File

@ -50,8 +50,9 @@ Status NewJdbcScanNode::_init_scanners(std::list<VScanner*>* scanners) {
if (_eos == true) {
return Status::OK();
}
NewJdbcScanner* scanner = new NewJdbcScanner(_state, this, _limit_per_scanner, _tuple_id,
_query_string, _table_type);
NewJdbcScanner* scanner =
new NewJdbcScanner(_state, this, _limit_per_scanner, _tuple_id, _query_string,
_table_type, _state->runtime_profile());
_scanner_pool.add(scanner);
RETURN_IF_ERROR(scanner->prepare(_state, _vconjunct_ctx_ptr.get()));
scanners->push_back(static_cast<VScanner*>(scanner));

View File

@ -20,8 +20,8 @@
namespace doris::vectorized {
NewJdbcScanner::NewJdbcScanner(RuntimeState* state, NewJdbcScanNode* parent, int64_t limit,
const TupleId& tuple_id, const std::string& query_string,
TOdbcTableType::type table_type)
: VScanner(state, static_cast<VScanNode*>(parent), limit),
TOdbcTableType::type table_type, RuntimeProfile* profile)
: VScanner(state, static_cast<VScanNode*>(parent), limit, profile),
_is_init(false),
_jdbc_eos(false),
_tuple_id(tuple_id),

View File

@ -27,7 +27,7 @@ class NewJdbcScanner : public VScanner {
public:
NewJdbcScanner(RuntimeState* state, NewJdbcScanNode* parent, int64_t limit,
const TupleId& tuple_id, const std::string& query_string,
TOdbcTableType::type table_type);
TOdbcTableType::type table_type, RuntimeProfile* profile);
Status open(RuntimeState* state) override;
Status close(RuntimeState* state) override;

View File

@ -51,7 +51,8 @@ Status NewOdbcScanNode::_init_scanners(std::list<VScanner*>* scanners) {
if (_eos == true) {
return Status::OK();
}
NewOdbcScanner* scanner = new NewOdbcScanner(_state, this, _limit_per_scanner, _odbc_scan_node);
NewOdbcScanner* scanner = new NewOdbcScanner(_state, this, _limit_per_scanner, _odbc_scan_node,
_state->runtime_profile());
_scanner_pool.add(scanner);
RETURN_IF_ERROR(scanner->prepare(_state, _vconjunct_ctx_ptr.get()));
scanners->push_back(static_cast<VScanner*>(scanner));

View File

@ -26,8 +26,8 @@ static const std::string NEW_SCANNER_TYPE = "NewOdbcScanner";
namespace doris::vectorized {
NewOdbcScanner::NewOdbcScanner(RuntimeState* state, NewOdbcScanNode* parent, int64_t limit,
const TOdbcScanNode& odbc_scan_node)
: VScanner(state, static_cast<VScanNode*>(parent), limit),
const TOdbcScanNode& odbc_scan_node, RuntimeProfile* profile)
: VScanner(state, static_cast<VScanNode*>(parent), limit, profile),
_is_init(false),
_odbc_eof(false),
_table_name(odbc_scan_node.table_name),

View File

@ -26,7 +26,7 @@ namespace doris::vectorized {
class NewOdbcScanner : public VScanner {
public:
NewOdbcScanner(RuntimeState* state, NewOdbcScanNode* parent, int64_t limit,
const TOdbcScanNode& odbc_scan_node);
const TOdbcScanNode& odbc_scan_node, RuntimeProfile* profile);
Status open(RuntimeState* state) override;

View File

@ -26,11 +26,10 @@ namespace doris::vectorized {
NewOlapScanner::NewOlapScanner(RuntimeState* state, NewOlapScanNode* parent, int64_t limit,
bool aggregation, bool need_agg_finalize,
const TPaloScanRange& scan_range, RuntimeProfile* profile)
: VScanner(state, static_cast<VScanNode*>(parent), limit),
: VScanner(state, static_cast<VScanNode*>(parent), limit, profile),
_aggregation(aggregation),
_need_agg_finalize(need_agg_finalize),
_version(-1),
_profile(profile) {
_version(-1) {
_tablet_schema = std::make_shared<TabletSchema>();
}

View File

@ -82,7 +82,6 @@ private:
// ========= profiles ==========
int64_t _compressed_bytes_read = 0;
int64_t _raw_rows_read = 0;
RuntimeProfile* _profile;
bool _profile_updated = false;
};
} // namespace vectorized

View File

@ -80,6 +80,12 @@ Status ScannerScheduler::init(ExecEnv* env) {
new PriorityThreadPool(config::doris_scanner_thread_pool_thread_num,
config::doris_scanner_thread_pool_queue_size, "remote_scan"));
ThreadPoolBuilder("LimitedScanThreadPool")
.set_min_threads(config::doris_scanner_thread_pool_thread_num)
.set_max_threads(config::doris_scanner_thread_pool_thread_num)
.set_max_queue_size(config::doris_scanner_thread_pool_queue_size)
.build(&_limited_scan_thread_pool);
_is_init = true;
return Status::OK();
}
@ -94,6 +100,11 @@ Status ScannerScheduler::submit(ScannerContext* ctx) {
return Status::OK();
}
std::unique_ptr<ThreadPoolToken> ScannerScheduler::new_limited_scan_pool_token(
ThreadPool::ExecutionMode mode, int max_concurrency) {
return _limited_scan_thread_pool->new_token(mode, max_concurrency);
}
void ScannerScheduler::_schedule_thread(int queue_id) {
BlockingQueue<ScannerContext*>* queue = _pending_queues[queue_id];
while (!_is_closed) {

View File

@ -19,6 +19,7 @@
#include "common/status.h"
#include "util/blocking_queue.hpp"
#include "util/threadpool.h"
#include "vec/exec/scan/scanner_context.h"
namespace doris::vectorized {
@ -50,6 +51,9 @@ public:
Status submit(ScannerContext* ctx);
std::unique_ptr<ThreadPoolToken> new_limited_scan_pool_token(ThreadPool::ExecutionMode mode,
int max_concurrency);
private:
// scheduling thread function
void _schedule_thread(int queue_id);
@ -75,8 +79,10 @@ private:
// execution thread pool
// _local_scan_thread_pool is for local scan task(typically, olap scanner)
// _remote_scan_thread_pool is for remote scan task(cold data on s3, hdfs, etc.)
// _limited_scan_thread_pool is a special pool for queries with resource limit
std::unique_ptr<PriorityThreadPool> _local_scan_thread_pool;
std::unique_ptr<PriorityThreadPool> _remote_scan_thread_pool;
std::unique_ptr<ThreadPool> _limited_scan_thread_pool;
// true is the scheduler is closed.
std::atomic_bool _is_closed = {false};

View File

@ -45,14 +45,13 @@ using namespace ErrorCode;
VFileScanner::VFileScanner(RuntimeState* state, NewFileScanNode* parent, int64_t limit,
const TFileScanRange& scan_range, RuntimeProfile* profile,
KVCache<std::string>& kv_cache)
: VScanner(state, static_cast<VScanNode*>(parent), limit),
: VScanner(state, static_cast<VScanNode*>(parent), limit, profile),
_params(scan_range.params),
_ranges(scan_range.ranges),
_next_range(0),
_cur_reader(nullptr),
_cur_reader_eof(false),
_mem_pool(std::make_unique<MemPool>()),
_profile(profile),
_kv_cache(kv_cache),
_strict_mode(false) {
if (scan_range.params.__isset.strict_mode) {

View File

@ -45,6 +45,10 @@ public:
Status prepare(VExprContext** vconjunct_ctx_ptr,
std::unordered_map<std::string, ColumnValueRangeType>* colname_to_value_range);
doris::TabletStorageType get_storage_type() override {
return doris::TabletStorageType::STORAGE_TYPE_REMOTE;
}
protected:
Status _get_block_impl(RuntimeState* state, Block* block, bool* eof) override;
@ -106,9 +110,6 @@ protected:
// Mem pool used to allocate _src_tuple and _src_tuple_row
std::unique_ptr<MemPool> _mem_pool;
// Profile
RuntimeProfile* _profile;
KVCache<std::string>& _kv_cache;
bool _scanner_eof = false;

View File

@ -21,10 +21,11 @@
namespace doris::vectorized {
VScanner::VScanner(RuntimeState* state, VScanNode* parent, int64_t limit)
VScanner::VScanner(RuntimeState* state, VScanNode* parent, int64_t limit, RuntimeProfile* profile)
: _state(state),
_parent(parent),
_limit(limit),
_profile(profile),
_input_tuple_desc(parent->input_tuple_desc()),
_output_tuple_desc(parent->output_tuple_desc()) {
_real_tuple_desc = _input_tuple_desc != nullptr ? _input_tuple_desc : _output_tuple_desc;

View File

@ -38,7 +38,7 @@ struct ScannerCounter {
class VScanner {
public:
VScanner(RuntimeState* state, VScanNode* parent, int64_t limit);
VScanner(RuntimeState* state, VScanNode* parent, int64_t limit, RuntimeProfile* profile);
virtual ~VScanner() {}
@ -93,7 +93,7 @@ public:
int queue_id() { return _state->exec_env()->store_path_to_index("xxx"); }
doris::TabletStorageType get_storage_type() {
virtual doris::TabletStorageType get_storage_type() {
return doris::TabletStorageType::STORAGE_TYPE_LOCAL;
}
@ -137,6 +137,8 @@ protected:
// Set if scan node has sort limit info
int64_t _limit = -1;
RuntimeProfile* _profile;
const TupleDescriptor* _input_tuple_desc = nullptr;
const TupleDescriptor* _output_tuple_desc = nullptr;
const TupleDescriptor* _real_tuple_desc = nullptr;

View File

@ -36,7 +36,6 @@ TestEnv::TestEnv() {
_exec_env->_thread_mgr = new ThreadResourceMgr(2);
_exec_env->_disk_io_mgr = new DiskIoMgr(1, 1, 1, 10);
_exec_env->disk_io_mgr()->init(-1);
_exec_env->_scan_thread_pool = new PriorityThreadPool(1, 16, "ut_scan");
_exec_env->_result_queue_mgr = new ResultQueueMgr();
// TODO may need rpc support, etc.
}
@ -58,7 +57,6 @@ void TestEnv::init_buffer_pool(int64_t min_page_len, int64_t capacity, int64_t c
TestEnv::~TestEnv() {
SAFE_DELETE(_exec_env->_result_queue_mgr);
SAFE_DELETE(_exec_env->_buffer_pool);
SAFE_DELETE(_exec_env->_scan_thread_pool);
SAFE_DELETE(_exec_env->_disk_io_mgr);
SAFE_DELETE(_exec_env->_thread_mgr);