[feature-wip](resouce-group) Supports memory soft isolation of resource group (#19802)

create resource groups name properties(
    'enable_memory_overcommit' = 'true' // whether to enable memory soft isolation
)
This commit is contained in:
luozenglin
2023-05-21 19:33:57 +08:00
committed by GitHub
parent a7f3bfec89
commit 33fd965b5c
11 changed files with 167 additions and 48 deletions

View File

@ -230,7 +230,8 @@ void Daemon::memory_gc_thread() {
auto sys_mem_available = doris::MemInfo::sys_mem_available();
auto proc_mem_no_allocator_cache = doris::MemInfo::proc_mem_no_allocator_cache();
auto tg_free_mem = taskgroup::TaskGroupManager::instance()->memory_limit_gc();
// GC excess memory for resource groups that not enable overcommit
auto tg_free_mem = doris::MemInfo::tg_hard_memory_limit_gc();
sys_mem_available += tg_free_mem;
proc_mem_no_allocator_cache -= tg_free_mem;

View File

@ -487,21 +487,13 @@ int64_t MemTrackerLimiter::free_top_overcommit_query(
}
int64_t MemTrackerLimiter::tg_memory_limit_gc(
uint64_t id, const std::string& name, int64_t memory_limit,
int64_t need_free_mem, int64_t used_memory, uint64_t id, const std::string& name,
int64_t memory_limit,
std::vector<taskgroup::TgTrackerLimiterGroup>& tracker_limiter_groups) {
int64_t used_memory = 0;
for (auto& mem_tracker_group : tracker_limiter_groups) {
std::lock_guard<std::mutex> l(mem_tracker_group.group_lock);
for (const auto& tracker : mem_tracker_group.trackers) {
used_memory += tracker->consumption();
}
}
if (used_memory <= memory_limit) {
if (need_free_mem <= 0) {
return 0;
}
int64_t need_free_mem = used_memory - memory_limit;
int64_t freed_mem = 0;
constexpr auto query_type = MemTrackerLimiter::Type::QUERY;
auto cancel_str = [id, &name, memory_limit, used_memory](int64_t mem_consumption,

View File

@ -200,7 +200,8 @@ public:
}
static int64_t tg_memory_limit_gc(
uint64_t id, const std::string& name, int64_t memory_limit,
int64_t request_free_memory, int64_t used_memory, uint64_t id, const std::string& name,
int64_t memory_limit,
std::vector<taskgroup::TgTrackerLimiterGroup>& tracker_limiter_groups);
// only for Type::QUERY or Type::LOAD.

View File

@ -30,7 +30,6 @@
#include "pipeline/task_scheduler.h"
#include "runtime/exec_env.h"
#include "runtime/memory/mem_tracker_limiter.h"
#include "service/backend_options.h"
#include "util/mem_info.h"
#include "util/parse_util.h"
@ -39,6 +38,7 @@ namespace taskgroup {
const static std::string CPU_SHARE = "cpu_share";
const static std::string MEMORY_LIMIT = "memory_limit";
const static std::string ENABLE_MEMORY_OVERCOMMIT = "enable_memory_overcommit";
pipeline::PipelineTask* TaskGroupEntity::take() {
if (_queue.empty()) {
@ -81,15 +81,18 @@ TaskGroup::TaskGroup(const TaskGroupInfo& tg_info)
_name(tg_info.name),
_cpu_share(tg_info.cpu_share),
_memory_limit(tg_info.memory_limit),
_enable_memory_overcommit(tg_info.enable_memory_overcommit),
_version(tg_info.version),
_task_entity(this),
_mem_tracker_limiter_pool(MEM_TRACKER_GROUP_NUM) {}
std::string TaskGroup::debug_string() const {
std::shared_lock<std::shared_mutex> rl {_mutex};
return fmt::format("TG[id = {}, name = {}, cpu_share = {}, memory_limit = {}, version = {}]",
_id, _name, cpu_share(), PrettyPrinter::print(_memory_limit, TUnit::BYTES),
_version);
return fmt::format(
"TG[id = {}, name = {}, cpu_share = {}, memory_limit = {}, enable_memory_overcommit = "
"{}, version = {}]",
_id, _name, cpu_share(), PrettyPrinter::print(_memory_limit, TUnit::BYTES),
_enable_memory_overcommit ? "true" : "false", _version);
}
void TaskGroup::check_and_update(const TaskGroupInfo& tg_info) {
@ -108,6 +111,7 @@ void TaskGroup::check_and_update(const TaskGroupInfo& tg_info) {
_name = tg_info.name;
_version = tg_info.version;
_memory_limit = tg_info.memory_limit;
_enable_memory_overcommit = tg_info.enable_memory_overcommit;
if (_cpu_share != tg_info.cpu_share) {
ExecEnv::GetInstance()->pipeline_task_group_scheduler()->update_tg_cpu_share(
tg_info, shared_from_this());
@ -119,6 +123,17 @@ void TaskGroup::update_cpu_share_unlock(const TaskGroupInfo& tg_info) {
_cpu_share = tg_info.cpu_share;
}
int64_t TaskGroup::memory_used() {
int64_t used_memory = 0;
for (auto& mem_tracker_group : _mem_tracker_limiter_pool) {
std::lock_guard<std::mutex> l(mem_tracker_group.group_lock);
for (const auto& tracker : mem_tracker_group.trackers) {
used_memory += tracker->consumption();
}
}
return used_memory;
}
void TaskGroup::add_mem_tracker_limiter(std::shared_ptr<MemTrackerLimiter> mem_tracker_ptr) {
auto group_num = mem_tracker_ptr->group_num();
std::lock_guard<std::mutex> l(_mem_tracker_limiter_pool[group_num].group_lock);
@ -131,16 +146,14 @@ void TaskGroup::remove_mem_tracker_limiter(std::shared_ptr<MemTrackerLimiter> me
_mem_tracker_limiter_pool[group_num].trackers.erase(mem_tracker_ptr);
}
int64_t TaskGroup::memory_limit_gc() {
std::string name;
int64_t memory_limit;
{
std::shared_lock<std::shared_mutex> rl {_mutex};
name = _name;
memory_limit = _memory_limit;
}
return MemTrackerLimiter::tg_memory_limit_gc(_id, name, memory_limit,
_mem_tracker_limiter_pool);
void TaskGroup::task_group_info(TaskGroupInfo* tg_info) const {
std::shared_lock<std::shared_mutex> r_lock(_mutex);
tg_info->id = _id;
tg_info->name = _name;
tg_info->cpu_share = _cpu_share;
tg_info->memory_limit = _memory_limit;
tg_info->enable_memory_overcommit = _enable_memory_overcommit;
tg_info->version = _version;
}
Status TaskGroupInfo::parse_group_info(const TPipelineResourceGroup& resource_group,
@ -174,6 +187,12 @@ Status TaskGroupInfo::parse_group_info(const TPipelineResourceGroup& resource_gr
return Status::InternalError(ss.str());
}
task_group_info->memory_limit = mem_limit;
auto enable_memory_overcommit_iter = resource_group.properties.find(ENABLE_MEMORY_OVERCOMMIT);
task_group_info->enable_memory_overcommit =
enable_memory_overcommit_iter != resource_group.properties.end() &&
enable_memory_overcommit_iter->second ==
"true" /* fe guarantees it is 'true' or 'false' */;
return Status::OK();
}

View File

@ -87,6 +87,18 @@ public:
uint64_t id() const { return _id; }
bool enable_memory_overcommit() const {
std::shared_lock<std::shared_mutex> r_lock(_mutex);
return _enable_memory_overcommit;
};
bool memory_limit() const {
std::shared_lock<std::shared_mutex> r_lock(_mutex);
return _memory_limit;
};
int64_t memory_used();
std::string debug_string() const;
void check_and_update(const TaskGroupInfo& tg_info);
@ -97,7 +109,11 @@ public:
void remove_mem_tracker_limiter(std::shared_ptr<MemTrackerLimiter> mem_tracker_ptr);
int64_t memory_limit_gc();
void task_group_info(TaskGroupInfo* tg_info) const;
std::vector<TgTrackerLimiterGroup>& mem_tracker_limiter_pool() {
return _mem_tracker_limiter_pool;
}
private:
mutable std::shared_mutex _mutex; // lock _name, _version, _cpu_share, _memory_limit
@ -105,6 +121,7 @@ private:
std::string _name;
std::atomic<uint64_t> _cpu_share;
int64_t _memory_limit; // bytes
bool _enable_memory_overcommit;
int64_t _version;
TaskGroupEntity _task_entity;
@ -117,8 +134,9 @@ struct TaskGroupInfo {
uint64_t id;
std::string name;
uint64_t cpu_share;
int64_t version;
int64_t memory_limit;
bool enable_memory_overcommit;
int64_t version;
static Status parse_group_info(const TPipelineResourceGroup& resource_group,
TaskGroupInfo* task_group_info);

View File

@ -54,20 +54,14 @@ TaskGroupPtr TaskGroupManager::get_or_create_task_group(const TaskGroupInfo& tas
return new_task_group;
}
int64_t TaskGroupManager::memory_limit_gc() {
int64_t total_free_memory = 0;
std::vector<TaskGroupPtr> task_groups;
{
std::shared_lock<std::shared_mutex> r_lock(_group_mutex);
task_groups.reserve(_task_groups.size());
for (const auto& [id, task_group] : _task_groups) {
task_groups.push_back(task_group);
void TaskGroupManager::get_resource_groups(const std::function<bool(const TaskGroupPtr& ptr)>& pred,
std::vector<TaskGroupPtr>* task_groups) {
std::shared_lock<std::shared_mutex> r_lock(_group_mutex);
for (const auto& [id, task_group] : _task_groups) {
if (pred(task_group)) {
task_groups->push_back(task_group);
}
}
for (const auto& task_group : task_groups) {
total_free_memory += task_group->memory_limit_gc();
}
return total_free_memory;
}
} // namespace doris::taskgroup

View File

@ -33,7 +33,8 @@ public:
TaskGroupPtr get_or_create_task_group(const TaskGroupInfo& task_group_info);
int64_t memory_limit_gc();
void get_resource_groups(const std::function<bool(const TaskGroupPtr& ptr)>& pred,
std::vector<TaskGroupPtr>* task_groups);
private:
std::shared_mutex _group_mutex;

View File

@ -43,6 +43,8 @@
#include "olap/segment_loader.h"
#include "runtime/memory/chunk_allocator.h"
#include "runtime/memory/mem_tracker_limiter.h"
#include "runtime/task_group/task_group.h"
#include "runtime/task_group/task_group_manager.h"
#include "util/cgroup_util.h"
#include "util/defer_op.h"
#include "util/parse_util.h"
@ -132,7 +134,8 @@ void MemInfo::process_cache_gc(int64_t& freed_mem) {
}
// step1: free all cache
// step2: free top overcommit query, if enable query memroy overcommit
// step2: free resource groups memory that enable overcommit
// step3: free global top overcommit query, if enable query memroy overcommit
// TODO Now, the meaning is different from java minor gc + full gc, more like small gc + large gc.
bool MemInfo::process_minor_gc() {
MonotonicStopWatch watch;
@ -154,6 +157,11 @@ bool MemInfo::process_minor_gc() {
// TODO add freed_mem
SegmentLoader::instance()->prune();
freed_mem += tg_soft_memory_limit_gc(_s_process_minor_gc_size - freed_mem);
if (freed_mem > _s_process_minor_gc_size) {
return true;
}
VLOG_NOTICE << MemTrackerLimiter::type_detail_usage(
"Before free top memory overcommit query in Minor GC", MemTrackerLimiter::Type::QUERY);
if (config::enable_query_memroy_overcommit) {
@ -167,9 +175,10 @@ bool MemInfo::process_minor_gc() {
}
// step1: free all cache
// step2: free top memory query
// step3: free top overcommit load, load retries are more expensive, So cancel at the end.
// step4: free top memory load
// step2: free resource groups memory that enable overcommit
// step3: free global top memory query
// step4: free top overcommit load, load retries are more expensive, So cancel at the end.
// step5: free top memory load
bool MemInfo::process_full_gc() {
MonotonicStopWatch watch;
watch.start();
@ -197,6 +206,11 @@ bool MemInfo::process_full_gc() {
}
}
freed_mem += tg_soft_memory_limit_gc(_s_process_full_gc_size - freed_mem);
if (freed_mem > _s_process_full_gc_size) {
return true;
}
VLOG_NOTICE << MemTrackerLimiter::type_detail_usage("Before free top memory query in Full GC",
MemTrackerLimiter::Type::QUERY);
freed_mem += MemTrackerLimiter::free_top_memory_query(_s_process_full_gc_size - freed_mem,
@ -225,6 +239,68 @@ bool MemInfo::process_full_gc() {
return false;
}
int64_t MemInfo::tg_hard_memory_limit_gc() {
std::vector<taskgroup::TaskGroupPtr> task_groups;
taskgroup::TaskGroupManager::instance()->get_resource_groups(
[](const taskgroup::TaskGroupPtr& task_group) {
return !task_group->enable_memory_overcommit();
},
&task_groups);
int64_t total_free_memory = 0;
for (const auto& task_group : task_groups) {
taskgroup::TaskGroupInfo tg_info;
task_group->task_group_info(&tg_info);
auto used = task_group->memory_used();
total_free_memory += MemTrackerLimiter::tg_memory_limit_gc(
used - tg_info.memory_limit, used, tg_info.id, tg_info.name, tg_info.memory_limit,
task_group->mem_tracker_limiter_pool());
}
return total_free_memory;
}
int64_t MemInfo::tg_soft_memory_limit_gc(int64_t request_free_memory) {
std::vector<taskgroup::TaskGroupPtr> task_groups;
taskgroup::TaskGroupManager::instance()->get_resource_groups(
[](const taskgroup::TaskGroupPtr& task_group) {
return task_group->enable_memory_overcommit();
},
&task_groups);
int64_t total_exceeded_memory = 0;
std::vector<int64_t> used_memorys;
std::vector<int64_t> exceeded_memorys;
for (const auto& task_group : task_groups) {
auto used_memory = task_group->memory_used();
auto exceeded = used_memory - task_group->memory_limit();
auto exceeded_memory = exceeded > 0 ? exceeded : 0;
total_exceeded_memory += exceeded_memory;
used_memorys.emplace_back(used_memory);
exceeded_memorys.emplace_back(exceeded_memory);
}
int64_t total_free_memory = 0;
bool gc_all_exceeded = request_free_memory >= total_exceeded_memory;
for (int i = 0; i < task_groups.size(); ++i) {
if (exceeded_memorys[i] == 0) {
continue;
}
// todo: GC according to resource group priority
int64_t tg_need_free_memory =
gc_all_exceeded ? exceeded_memorys[i]
: static_cast<double>(exceeded_memorys[i]) / total_exceeded_memory *
request_free_memory /* exceeded memory as a weight */;
auto task_group = task_groups[i];
taskgroup::TaskGroupInfo tg_info;
task_group->task_group_info(&tg_info);
total_free_memory += MemTrackerLimiter::tg_memory_limit_gc(
tg_need_free_memory, used_memorys[i], tg_info.id, tg_info.name,
tg_info.memory_limit, task_group->mem_tracker_limiter_pool());
}
return total_free_memory;
}
#ifndef __APPLE__
void MemInfo::refresh_proc_meminfo() {
std::ifstream meminfo("/proc/meminfo", std::ios::in);

View File

@ -123,6 +123,10 @@ public:
static bool process_minor_gc();
static bool process_full_gc();
static int64_t tg_hard_memory_limit_gc();
static int64_t tg_soft_memory_limit_gc(int64_t request_free_memory);
// It is only used after the memory limit is exceeded. When multiple threads are waiting for the available memory of the process,
// avoid multiple threads starting at the same time and causing OOM.
static std::atomic<int64_t> refresh_interval_memory_growth;

View File

@ -46,11 +46,13 @@ public class ResourceGroup implements Writable {
public static final String MEMORY_LIMIT = "memory_limit";
public static final String ENABLE_MEMORY_OVERCOMMIT = "enable_memory_overcommit";
private static final ImmutableSet<String> REQUIRED_PROPERTIES_NAME = new ImmutableSet.Builder<String>().add(
CPU_SHARE).add(MEMORY_LIMIT).build();
private static final ImmutableSet<String> ALL_PROPERTIES_NAME = new ImmutableSet.Builder<String>().add(
CPU_SHARE).add(MEMORY_LIMIT).build();
CPU_SHARE).add(MEMORY_LIMIT).add(ENABLE_MEMORY_OVERCOMMIT).build();
@SerializedName(value = "id")
private long id;
@ -78,6 +80,9 @@ public class ResourceGroup implements Writable {
this.version = version;
String memoryLimitString = properties.get(MEMORY_LIMIT);
this.memoryLimitPercent = Double.parseDouble(memoryLimitString.substring(0, memoryLimitString.length() - 1));
if (properties.containsKey(ENABLE_MEMORY_OVERCOMMIT)) {
properties.put(ENABLE_MEMORY_OVERCOMMIT, properties.get(ENABLE_MEMORY_OVERCOMMIT).toLowerCase());
}
}
public static ResourceGroup create(String name, Map<String, String> properties) throws DdlException {
@ -129,6 +134,13 @@ public class ResourceGroup implements Writable {
LOG.debug(memLimitErr, e);
throw new DdlException(memLimitErr);
}
if (properties.containsKey(ENABLE_MEMORY_OVERCOMMIT)) {
String value = properties.get(ENABLE_MEMORY_OVERCOMMIT).toLowerCase();
if (!("true".equals(value) || "false".equals(value))) {
throw new DdlException("The value of '" + ENABLE_MEMORY_OVERCOMMIT + "' must be true or false.");
}
}
}
public long getId() {

View File

@ -123,7 +123,8 @@ public class ResourceGroupMgr implements Writable, GsonPostProcessable {
}
Map<String, String> properties = Maps.newHashMap();
properties.put(ResourceGroup.CPU_SHARE, "10");
properties.put(ResourceGroup.MEMORY_LIMIT, "100%");
properties.put(ResourceGroup.MEMORY_LIMIT, "30%");
properties.put(ResourceGroup.ENABLE_MEMORY_OVERCOMMIT, "true");
defaultResourceGroup = ResourceGroup.create(DEFAULT_GROUP_NAME, properties);
nameToResourceGroup.put(DEFAULT_GROUP_NAME, defaultResourceGroup);
idToResourceGroup.put(defaultResourceGroup.getId(), defaultResourceGroup);