add a tenant parameter to restrict tmp file disk usage

dongb0 2025-01-03 12:15:23 +00:00 committed by ob-robot
parent 9f87f66ded
commit 9fc5197ec8
11 changed files with 120 additions and 25 deletions

View File

@ -547,15 +547,14 @@ DEF_INT(_pushdown_storage_level, OB_TENANT_PARAMETER, "4", "[0, 4]",
ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_WORK_AREA_POLICY(workarea_size_policy, OB_TENANT_PARAMETER, "AUTO", "policy used to size SQL working areas (MANUAL/AUTO)",
ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_CAP(temporary_file_max_disk_size, OB_TENANT_PARAMETER, "0M", "[0,)",
"maximum disk usage of temporary file on a single node, 0 means no limit. "
"Range: [0,+∞)",
ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_INT(_temporary_file_io_area_size, OB_TENANT_PARAMETER, "1", "[0, 50)",
"memory buffer size of temporary file, as a percentage of total tenant memory. "
"Range: [0, 50), percentage",
ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_INT(_temporary_file_meta_memory_limit_percentage, OB_TENANT_PARAMETER, "0", "[0,100]",
"The memory limit of temporary file meta, and the value is a percentage of the tenant's memory. "
"The default value is 70. For compatibility, 0 is 70% of tenant memory."
"Range: [0, 100], percentage",
ObParameterAttr(Section::TENANT, Source::DEFAULT, EditLevel::DYNAMIC_EFFECTIVE));
DEF_INT(_storage_meta_memory_limit_percentage, OB_TENANT_PARAMETER, "20", "[0, 50)",
"maximum memory for storage meta, as a percentage of total tenant memory. "
"Range: [0, 50), percentage, 0 means no limit to storage meta memory",

View File

@ -209,6 +209,7 @@ ObITmpFile::ObITmpFile()
begin_page_id_(ObTmpFileGlobal::INVALID_PAGE_ID),
begin_page_virtual_id_(ObTmpFileGlobal::INVALID_VIRTUAL_PAGE_ID),
end_page_id_(ObTmpFileGlobal::INVALID_PAGE_ID),
diag_log_print_cnt_(0),
data_page_flush_level_(-1),
data_flush_node_(*this),
meta_lock_(common::ObLatchIds::TMP_FILE_LOCK),
@ -319,6 +320,7 @@ void ObITmpFile::reset()
begin_page_id_ = ObTmpFileGlobal::INVALID_PAGE_ID;
begin_page_virtual_id_ = ObTmpFileGlobal::INVALID_VIRTUAL_PAGE_ID;
end_page_id_ = ObTmpFileGlobal::INVALID_PAGE_ID;
diag_log_print_cnt_ = 0;
data_page_flush_level_ = -1;
data_flush_node_.unlink();
wbp_ = nullptr;
@ -623,7 +625,15 @@ int ObITmpFile::aio_write(ObTmpFileIOCtx &io_ctx)
}
if (OB_SUCC(ret)) {
LOG_DEBUG("aio write finish", KR(ret), K(fd_), K(file_size_), K(io_ctx));
// ATTENTION! we print tmp file data members here without holding meta_lock_.
static const int64_t PRINT_LOG_FILE_SIZE = 100 * 1024 * 1024; // 100MB
int64_t cur_print_cnt = file_size_ / PRINT_LOG_FILE_SIZE;
if (cur_print_cnt > ATOMIC_LOAD(&diag_log_print_cnt_)) {
ATOMIC_INC(&diag_log_print_cnt_);
LOG_INFO("aio write finish", K(fd_), K(io_ctx), KPC(this));
} else {
LOG_DEBUG("aio write finish", KR(ret), K(fd_), K(file_size_), K(io_ctx));
}
} else {
LOG_DEBUG("aio write failed", KR(ret), K(fd_), K(file_size_), K(io_ctx), KPC(this));
}
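The new branch above throttles the verbose INFO dump: it fires roughly once per 100MB of growth of file_size_ and falls back to the per-call DEBUG log otherwise. A standalone sketch of the same size-bucketed throttling idea, with illustrative names only (not OceanBase APIs):

#include <atomic>
#include <cstdint>
#include <cstdio>

// Log a verbose diagnostic at most once per 100MB of file growth.
static constexpr int64_t kPrintLogFileSize = 100LL * 1024 * 1024;

struct ThrottledDiagLog {
  std::atomic<int64_t> printed_cnt{0};

  void on_write_finished(int64_t file_size)
  {
    const int64_t cur_bucket = file_size / kPrintLogFileSize;
    if (cur_bucket > printed_cnt.load(std::memory_order_relaxed)) {
      // Catch up one bucket per write, as the diff does with ATOMIC_INC.
      printed_cnt.fetch_add(1, std::memory_order_relaxed);
      std::printf("verbose diagnosis, file_size=%lld\n", (long long)file_size);
    } // otherwise: keep quiet (the real code emits a DEBUG-level log instead)
  }
};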

View File

@ -332,6 +332,7 @@ protected:
uint32_t begin_page_id_; // the first page index in write buffer pool
int64_t begin_page_virtual_id_;
uint32_t end_page_id_; // the last page index in write buffer pool
int64_t diag_log_print_cnt_; // print a diagnosis log for every PRINT_LOG_FILE_SIZE bytes of file growth
int64_t data_page_flush_level_;
ObTmpFileNode data_flush_node_;
common::TCRWLock meta_lock_; // handle conflicts between writing and reading meta tree and meta data of file

View File

@ -381,7 +381,7 @@ int ObITenantTmpFileManager::aio_write(const uint64_t tenant_id,
LOG_WARN("ObITenantTmpFileManager has not been inited", KR(ret), K(tenant_id_));
} else if (OB_UNLIKELY(!io_info.is_valid())) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("fail to aio read, invalid argument", KR(ret), K(io_info));
LOG_WARN("fail to aio write, invalid argument", KR(ret), K(io_info));
} else if (OB_UNLIKELY(MTL_ID() != tenant_id)) {
ret = OB_INVALID_ARGUMENT;
LOG_WARN("tenant id not match", KR(ret), K(tenant_id), K(MTL_ID()));

View File

@ -823,7 +823,7 @@ void ObTmpFileBlockManager::print_block_usage()
int64_t occupied_page_num = block_num * ObTmpFileGlobal::BLOCK_PAGE_NUMS;
double disk_fragment_ratio = static_cast<double>(used_page_num) / static_cast<double>(occupied_page_num);
LOG_INFO("the block usage for temporary files",
K(used_page_num), K(occupied_page_num), K(disk_fragment_ratio));
K(used_page_num), K(occupied_page_num), K(block_num), K(disk_fragment_ratio));
}
}
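For context, the ratio logged here is used_page_num / (block_num * BLOCK_PAGE_NUMS): if, say, 3 blocks with 512 pages each (an illustrative figure, not taken from this diff) hold 1,000 used pages, occupied_page_num is 1,536 and disk_fragment_ratio is about 0.65; the closer the ratio is to 1, the less internal fragmentation the temporary-file blocks carry.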

View File

@ -253,6 +253,7 @@ void ObTmpFileFlushManager::inner_advance_flush_level_without_checking_watermark
int ObTmpFileFlushManager::flush(ObSpLinkQueue &flushing_queue,
ObTmpFileFlushMonitor &flush_monitor,
const int64_t expect_flush_size,
const int64_t current_flush_cnt,
const bool is_flush_meta_tree)
{
int ret = OB_SUCCESS;
@ -265,14 +266,14 @@ int ObTmpFileFlushManager::flush(ObSpLinkQueue &flushing_queue,
} else if (OB_FAIL(flush_ctx_.prepare_flush_ctx(expect_flush_size, &flush_priority_mgr_, &flush_monitor))) {
STORAGE_LOG(WARN, "fail to prepare flush iterator", KR(ret), K(flush_ctx_));
} else {
if (OB_FAIL(flush_by_watermark_(flushing_queue, is_flush_meta_tree))) {
if (OB_FAIL(flush_by_watermark_(flushing_queue, current_flush_cnt, is_flush_meta_tree))) {
STORAGE_LOG(DEBUG, "fail to flush by watermark", KR(ret), K(flush_ctx_));
}
if (!flushing_queue.is_empty()) {
STORAGE_LOG(DEBUG, "ObTmpFileFlushManager flush finish", KR(ret), K(fast_flush_meta), K(flush_ctx_));
}
if (OB_FAIL(ret) && !flushing_queue.is_empty()) {
if (OB_FAIL(ret) && ret != OB_TMP_FILE_EXCEED_DISK_QUOTA && !flushing_queue.is_empty()) {
ret = OB_SUCCESS; // ignore the error if at least 1 task was generated
}
flush_ctx_.clear_flush_ctx(flush_priority_mgr_);
@ -280,14 +281,19 @@ int ObTmpFileFlushManager::flush(ObSpLinkQueue &flushing_queue,
return ret;
}
int ObTmpFileFlushManager::flush_by_watermark_(ObSpLinkQueue &flushing_queue, const bool is_flush_meta_tree)
int ObTmpFileFlushManager::flush_by_watermark_(ObSpLinkQueue &flushing_queue,
const int64_t current_flush_cnt,
const bool is_flush_meta_tree)
{
int ret = OB_SUCCESS;
bool fast_flush_meta = is_flush_meta_tree;
int64_t flushing_cnt = current_flush_cnt;
while (OB_SUCC(ret) && !flush_ctx_.is_fail_too_many()
&& (FlushCtxState::FSM_FINISHED != flush_ctx_.get_state() || fast_flush_meta)) {
ObTmpFileFlushTask *flush_task = nullptr;
if (OB_FAIL(handle_alloc_flush_task_(fast_flush_meta, flush_task))) {
if (OB_FAIL(check_tmp_file_disk_usage_limit_(flushing_cnt++))) {
STORAGE_LOG(WARN, "tmp file exceeds disk quota", KR(ret), K(current_flush_cnt), K(is_flush_meta_tree));
} else if (OB_FAIL(handle_alloc_flush_task_(fast_flush_meta, flush_task))) {
STORAGE_LOG(WARN, "fail to alloc flush task", KR(ret), K(flush_ctx_));
} else {
flush_ctx_.inc_create_flush_task_cnt();
@ -341,6 +347,29 @@ int ObTmpFileFlushManager::flush_by_watermark_(ObSpLinkQueue &flushing_queue, co
return ret;
}
int ObTmpFileFlushManager::check_tmp_file_disk_usage_limit_(const int64_t current_flush_cnt)
{
int ret = OB_SUCCESS;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("tmp file page cache controller is not inited", KR(ret));
} else {
int64_t disk_usage_limit = pc_ctrl_.get_disk_usage_limit();
int64_t used_page_num = 0;
int64_t tmp_file_block_num = 0;
int64_t current_disk_usage = 0;
if (OB_FAIL(tmp_file_block_mgr_.get_block_usage_stat(used_page_num, tmp_file_block_num))) {
STORAGE_LOG(WARN, "fail to get tmp file block usage stat", KR(ret));
} else if (FALSE_IT(current_disk_usage = (tmp_file_block_num + current_flush_cnt) * ObTmpFileGlobal::SN_BLOCK_SIZE)) {
} else if (disk_usage_limit > 0 && current_disk_usage > disk_usage_limit) {
ret = OB_TMP_FILE_EXCEED_DISK_QUOTA;
STORAGE_LOG(WARN, "tmp file exceeds disk usage limit",
KR(ret), K(current_disk_usage), K(disk_usage_limit), K(tmp_file_block_num), K(current_flush_cnt));
}
}
return ret;
}
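The check above estimates disk usage pessimistically: blocks that already exist (tmp_file_block_num) plus flush tasks in flight or being generated in this round (current_flush_cnt) are each charged one SN_BLOCK_SIZE, so a new flush task is rejected before it could push the tenant past the quota. As a purely illustrative calculation, assuming SN_BLOCK_SIZE were 2MB (not stated in this diff): 5,000 existing blocks plus 12 counted flush tasks give (5,000 + 12) * 2MB ≈ 9.8GB, so a temporary_file_max_disk_size of 8G would make this call return OB_TMP_FILE_EXCEED_DISK_QUOTA, while the default of 0 disables the check entirely.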
// skip flush level, copy meta tree pages directly
int ObTmpFileFlushManager::fast_fill_block_buf_with_meta_(ObTmpFileFlushTask &flush_task)
{

View File

@ -79,12 +79,14 @@ public:
int flush(ObSpLinkQueue &flushing_queue,
ObTmpFileFlushMonitor &flush_monitor,
const int64_t expect_flush_size,
const int64_t current_flush_cnt,
const bool is_flush_meta_tree);
int retry(ObTmpFileFlushTask &flush_task);
int io_finished(ObTmpFileFlushTask &flush_task);
int update_file_meta_after_flush(ObTmpFileFlushTask &flush_task);
void try_remove_unused_file_flush_ctx();
private:
int check_tmp_file_disk_usage_limit_(const int64_t current_flushing_cnt);
int fill_block_buf_(ObTmpFileFlushTask &flush_task);
int fast_fill_block_buf_with_meta_(ObTmpFileFlushTask &flush_task);
int inner_fill_block_buf_(ObTmpFileFlushTask &flush_task,
@ -110,7 +112,9 @@ private:
int handle_wait_(ObTmpFileFlushTask &flush_task, FlushState &next_state);
int handle_finish_(ObTmpFileFlushTask &flush_task);
private:
int flush_by_watermark_(ObSpLinkQueue &flushing_queue, const bool is_flush_meta_tree);
int flush_by_watermark_(ObSpLinkQueue &flushing_queue,
const int64_t current_flush_cnt,
const bool is_flush_meta_tree);
int update_meta_data_after_flush_for_files_(ObTmpFileFlushTask &flush_task);
int reset_flush_ctx_for_file_(const ObSharedNothingTmpFile *file, const bool is_meta);
int get_or_create_file_in_ctx_(const int64_t fd, ObTmpFileSingleFlushContext &file_flush_ctx);

View File

@ -127,6 +127,25 @@ int ObTmpFilePageCacheController::free_swap_job_(ObTmpFileSwapJob *swap_job)
return ret;
}
// refresh the tmp file disk usage limit from tenant config, with a 10ms config access timeout
void ObTmpFilePageCacheController::refresh_disk_usage_limit()
{
int ret = OB_SUCCESS;
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
LOG_WARN("tmp file page cache controller is not inited", KR(ret));
} else {
omt::ObTenantConfigGuard config(TENANT_CONF_TIL(MTL_ID(), ACCESS_TENANT_CONFIG_TIMEOUT_US));
if (!config.is_valid()) {
// do nothing
} else {
const int64_t max_disk_usage = config->temporary_file_max_disk_size;
int64_t disk_limit = max_disk_usage > 0 ? max_disk_usage : 0;
ATOMIC_SET(&disk_usage_limit_, disk_limit);
}
}
}
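refresh_disk_usage_limit() publishes the value with ATOMIC_SET, and the flush path reads it lock-free through get_disk_usage_limit(), so the hot path never touches the tenant config lock. A minimal standalone sketch of that publish/read pattern (illustrative names, not OceanBase APIs):

#include <atomic>
#include <cstdint>

// A background thread refreshes the limit every few seconds; readers on the
// flush path load it without locks. 0 keeps the "no limit" convention.
class DiskLimitHolder {
public:
  void refresh(int64_t configured)   // background/swap thread
  {
    limit_.store(configured > 0 ? configured : 0, std::memory_order_relaxed);
  }
  int64_t get() const                // flush path
  {
    return limit_.load(std::memory_order_relaxed);
  }
private:
  std::atomic<int64_t> limit_{0};
};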
int ObTmpFilePageCacheController::invoke_swap_and_wait(int64_t expect_swap_size, int64_t timeout_ms)
{
int ret = OB_SUCCESS;
@ -138,7 +157,10 @@ int ObTmpFilePageCacheController::invoke_swap_and_wait(int64_t expect_swap_size,
void *task_buf = nullptr;
ObTmpFileSwapJob *swap_job = nullptr;
if (OB_ISNULL(task_buf = task_allocator_.alloc(sizeof(ObTmpFileSwapJob)))) {
if (IS_NOT_INIT) {
ret = OB_NOT_INIT;
STORAGE_LOG(WARN, "tmp file page cache controller is not inited", KR(ret));
} else if (OB_ISNULL(task_buf = task_allocator_.alloc(sizeof(ObTmpFileSwapJob)))) {
ret = OB_ALLOCATE_MEMORY_FAILED;
STORAGE_LOG(WARN, "fail to allocate memory for swap job", KR(ret));
} else if (FALSE_IT(swap_job = new (task_buf) ObTmpFileSwapJob())) {

View File

@ -26,6 +26,7 @@ public:
ObTmpFilePageCacheController(ObTmpFileBlockManager &tmp_file_block_manager)
: is_inited_(false),
flush_all_data_(false),
disk_usage_limit_(0),
tmp_file_block_manager_(tmp_file_block_manager),
task_allocator_(),
write_buffer_pool_(),
@ -42,6 +43,8 @@ public:
static const int64_t FLUSH_INTERVAL = 1000; // 1s
static const int64_t SWAP_FAST_INTERVAL = 5; // 5ms
static const int64_t SWAP_INTERVAL = 1000; // 1s
static const int64_t REFRESH_CONFIG_INTERVAL = 10 * 1000 * 1000; // 10s
static const int64_t ACCESS_TENANT_CONFIG_TIMEOUT_US = 10 * 1000; // 10ms
virtual int init();
int start();
void stop();
@ -55,7 +58,9 @@ public:
ObTmpFileBlockManager &get_tmp_file_block_manager() { return tmp_file_block_manager_; }
OB_INLINE bool is_flush_all_data() { return ATOMIC_LOAD(&flush_all_data_); }
OB_INLINE void set_flush_all_data(bool flush_all_data) { ATOMIC_STORE(&flush_all_data_, flush_all_data); }
OB_INLINE int64_t get_disk_usage_limit() const { return ATOMIC_LOAD(&disk_usage_limit_); }
virtual int invoke_swap_and_wait(int64_t expect_swap_size, int64_t timeout_ms = ObTmpFileSwapJob::DEFAULT_TIMEOUT_MS);
void refresh_disk_usage_limit();
private:
int swap_job_enqueue_(ObTmpFileSwapJob *swap_job);
int free_swap_job_(ObTmpFileSwapJob *swap_job);
@ -63,6 +68,7 @@ private:
private:
bool is_inited_;
bool flush_all_data_; // set to true to flush all pages when shrinking write buffer pool
int64_t disk_usage_limit_; // disk usage limit, refreshed periodically from tenant config
ObTmpFileBlockManager &tmp_file_block_manager_; // ref to ObTmpFileBlockManager
ObFIFOAllocator task_allocator_; // used by flush_mgr_ to allocate flush tasks
ObTmpWriteBufferPool write_buffer_pool_;

View File

@ -424,11 +424,20 @@ int ObTmpFileFlushTG::wash_(const int64_t expect_flush_size, const RUNNING_MODE
{
int ret = OB_SUCCESS;
int64_t flushing_task_cnt = 0;
int64_t current_flush_cnt = ATOMIC_LOAD(&flushing_block_num_);
ObSpLinkQueue flushing_list;
if (OB_FAIL(flush_mgr_.flush(flushing_list, flush_monitor_, expect_flush_size, is_fast_flush_meta_))) {
STORAGE_LOG(WARN, "flush mgr fail to do flush", KR(ret), KPC(this));
} else if (OB_FAIL(handle_generated_flush_tasks_(flushing_list, flushing_task_cnt))) {
STORAGE_LOG(WARN, "fail to handle generated flush tasks", KR(ret), K(flushing_task_cnt), KPC(this));
if (OB_FAIL(flush_mgr_.flush(flushing_list, flush_monitor_, expect_flush_size,
current_flush_cnt, is_fast_flush_meta_))) {
if (OB_TMP_FILE_EXCEED_DISK_QUOTA == ret) {
signal_io_finish(ret);
}
}
if (!flushing_list.is_empty()) { // ignore ret
if (OB_FAIL(handle_generated_flush_tasks_(flushing_list, flushing_task_cnt))) {
STORAGE_LOG(WARN, "fail to handle generated flush tasks", KR(ret), K(flushing_task_cnt), KPC(this));
}
}
bool idle_loop = flushing_task_cnt == 0;
@ -530,11 +539,21 @@ int ObTmpFileFlushTG::special_flush_meta_tree_page_()
int ret = OB_SUCCESS;
ObSpLinkQueue flushing_list;
int64_t flushing_task_cnt = 0;
int64_t current_flush_cnt = ATOMIC_LOAD(&flushing_block_num_);
int64_t expect_flush_size = OB_STORAGE_OBJECT_MGR.get_macro_object_size();
if (OB_FAIL(flush_mgr_.flush(flushing_list, flush_monitor_, expect_flush_size, true/*is_flush_meta_tree*/))) {
STORAGE_LOG(ERROR, "flush mgr fail to do fast flush meta tree page", KR(ret), KPC(this));
} else if (OB_FAIL(handle_generated_flush_tasks_(flushing_list, flushing_task_cnt))) {
STORAGE_LOG(WARN, "fail to handle fast flush meta tasks", KR(ret), K(flushing_task_cnt), KPC(this));
if (OB_FAIL(flush_mgr_.flush(flushing_list, flush_monitor_, expect_flush_size,
current_flush_cnt, true/*is_flush_meta_tree*/))) {
if (OB_TMP_FILE_EXCEED_DISK_QUOTA == ret) {
signal_io_finish(ret);
} else {
STORAGE_LOG(ERROR, "flush mgr fail to do fast flush meta tree page", KR(ret), KPC(this));
}
}
if (!flushing_list.is_empty()) { // ignore ret
if (OB_FAIL(handle_generated_flush_tasks_(flushing_list, flushing_task_cnt))) {
STORAGE_LOG(WARN, "fail to handle fast flush meta tasks", KR(ret), K(flushing_task_cnt), KPC(this));
}
}
return ret;
}
@ -971,6 +990,10 @@ int ObTmpFileSwapTG::do_work_()
}
if (OB_SUCC(ret)) {
if (TC_REACH_TIME_INTERVAL(ObTmpFilePageCacheController::REFRESH_CONFIG_INTERVAL)) {
pc_ctrl_.refresh_disk_usage_limit();
}
if (OB_FAIL(shrink_wbp_if_needed_())) {
STORAGE_LOG(WARN, "fail to flush for shrinking wbp", KR(ret), KPC(this));
}
@ -1047,8 +1070,9 @@ int ObTmpFileSwapTG::swap_fast_()
wakeup_satisfied_jobs_(wakeup_job_cnt);
wakeup_timeout_jobs_();
int io_finished_ret = flush_tg_ref_.get_flush_io_finished_ret();
if (OB_SERVER_OUTOF_DISK_SPACE == io_finished_ret) {
wakeup_all_jobs_(OB_SERVER_OUTOF_DISK_SPACE);
if (OB_SERVER_OUTOF_DISK_SPACE == io_finished_ret ||
OB_TMP_FILE_EXCEED_DISK_QUOTA == io_finished_ret) {
wakeup_all_jobs_(io_finished_ret);
}
// do flush if could not evict enough pages
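Taken together with the wash_() changes above: when a flush round is rejected by the quota check, the flush thread records the code through signal_io_finish(ret); the swap thread later observes it via get_flush_io_finished_ret() and wakes every waiting swap job with that error, reusing the path already used for OB_SERVER_OUTOF_DISK_SPACE, so writers blocked on free write-buffer pages fail fast with OB_TMP_FILE_EXCEED_DISK_QUOTA instead of waiting for a timeout. A generic sketch of that record-and-wake-all hand-off, using standard C++ primitives and purely illustrative names:

#include <condition_variable>
#include <mutex>

// One thread records the latest flush result; all waiters are released with it.
class FlushResultGate {
public:
  void signal(int ret)        // flush thread: record the result and wake everyone
  {
    std::lock_guard<std::mutex> lk(mu_);
    last_ret_ = ret;
    cv_.notify_all();
  }
  int wait_for_error()        // waiter: blocks until a non-zero result is signalled
  {
    std::unique_lock<std::mutex> lk(mu_);
    cv_.wait(lk, [this] { return last_ret_ != 0; });
    return last_ret_;
  }
private:
  std::mutex mu_;
  std::condition_variable cv_;
  int last_ret_ = 0;          // 0 = no error recorded yet
};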

View File

@ -278,6 +278,7 @@ tcp_keepcnt
tcp_keepidle
tcp_keepintvl
tde_method
temporary_file_max_disk_size
tenant_sql_login_thread_count
tenant_sql_net_thread_count
tenant_task_queue_size
@ -523,7 +524,6 @@ _storage_meta_memory_limit_percentage
_stream_rpc_max_wait_timeout
_system_tenant_limit_mode
_temporary_file_io_area_size
_temporary_file_meta_memory_limit_percentage
_trace_control_info
_transfer_finish_trans_timeout
_transfer_process_lock_tx_timeout