After the memory limit is exceeded: previously a query waited for memory to be freed inside the mem hook; change it to wait in the Allocator instead, which is more controllable and safer.
388 lines
16 KiB
C++
388 lines
16 KiB
C++
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
// This file is copied from
|
|
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Allocator.h
|
|
// and modified by Doris
|
|
|
|
#pragma once
|
|
|
|
// TODO: Readable
|
|
|
|
#include <fmt/format.h>
|
|
#include <string.h>
|
|
|
|
#include "common/config.h"
|
|
#include "common/status.h"
|
|
#include "runtime/memory/chunk.h"
|
|
#include "runtime/memory/chunk_allocator.h"
|
|
#include "runtime/thread_context.h"
|
|
|
|
#ifdef NDEBUG
|
|
#define ALLOCATOR_ASLR 0
|
|
#else
|
|
#define ALLOCATOR_ASLR 1
|
|
#endif
|
|
|
|
#if !defined(__APPLE__) && !defined(__FreeBSD__)
|
|
#include <malloc.h>
|
|
#else
|
|
#define _DARWIN_C_SOURCE
|
|
#endif
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#include <algorithm>
|
|
#include <cstdlib>
|
|
|
|
#include "common/compiler_util.h"
|
|
#ifdef THREAD_SANITIZER
|
|
/// Thread sanitizer does not intercept mremap. The usage of mremap will lead to false positives.
|
|
#define DISABLE_MREMAP 1
|
|
#endif
|
|
#include "common/exception.h"
|
|
#include "vec/common/allocator_fwd.h"
|
|
#include "vec/common/mremap.h"
|
|
|
|
/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS
|
|
#ifndef MAP_ANONYMOUS
|
|
#define MAP_ANONYMOUS MAP_ANON
|
|
#endif
|
|
|
|
#ifdef NDEBUG
|
|
/**
|
|
* Many modern allocators (for example, tcmalloc) do not do a mremap for
|
|
* realloc, even in case of large enough chunks of memory. Although this allows
|
|
* you to increase performance and reduce memory consumption during realloc.
|
|
* To fix this, we do mremap manually if the chunk of memory is large enough.
|
|
* The threshold (64 MB) is chosen quite large, since changing the address
|
|
* space is very slow, especially in the case of a large number of threads. We
|
|
* expect that the set of operations mmap/something to do/mremap can only be
|
|
* performed about 1000 times per second.
|
|
*
|
|
* P.S. This is also required, because tcmalloc can not allocate a chunk of
|
|
* memory greater than 16 GB.
|
|
*/
|
|
static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20);
|
|
/**
|
|
* Memory allocation between 4KB and 64MB will be through ChunkAllocator,
|
|
* those less than 4KB will be through malloc (for example, tcmalloc),
|
|
* and those greater than 64MB will be through MMAP.
|
|
* In the actual test, chunkallocator allocates less than 4KB of memory slower than malloc,
|
|
* and chunkallocator allocates more than 64MB of memory slower than MMAP,
|
|
* but the 4KB threshold is an empirical value, which needs to be determined
|
|
* by more detailed test later.
|
|
*/
|
|
static constexpr size_t CHUNK_THRESHOLD = 4096;
|
|
#else
|
|
/**
|
|
* In debug build, use small mmap threshold to reproduce more memory
|
|
* stomping bugs. Along with ASLR it will hopefully detect more issues than
|
|
* ASan. The program may fail due to the limit on number of memory mappings.
|
|
*/
|
|
static constexpr size_t MMAP_THRESHOLD = 4096;
|
|
static constexpr size_t CHUNK_THRESHOLD = 1024;
|
|
#endif
|
|
|
|
static constexpr size_t MMAP_MIN_ALIGNMENT = 4096;
|
|
static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;
|
|
|
|
#define RETURN_BAD_ALLOC(err) \
|
|
do { \
|
|
LOG(WARNING) << err; \
|
|
if (!doris::enable_thread_catch_bad_alloc) \
|
|
doris::MemTrackerLimiter::print_log_process_usage(err); \
|
|
throw std::bad_alloc {}; \
|
|
} while (0)
|
|
|
|
#define RETURN_BAD_ALLOC_IF_PRE_CATCH(err) \
|
|
do { \
|
|
LOG(WARNING) << err; \
|
|
if (!doris::enable_thread_catch_bad_alloc) { \
|
|
doris::MemTrackerLimiter::print_log_process_usage(err); \
|
|
} else { \
|
|
throw std::bad_alloc {}; \
|
|
} \
|
|
} while (0)
|
|
|
|
/** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena.
|
|
* Also used in hash tables.
|
|
* The interface is different from std::allocator
|
|
* - the presence of the method realloc, which for large chunks of memory uses mremap;
|
|
* - passing the size into the `free` method;
|
|
* - by the presence of the `alignment` argument;
|
|
* - the possibility of zeroing memory (used in hash tables);
|
|
* - random hint address for mmap
|
|
* - mmap_threshold for using mmap less or more
|
|
*/
|
|
template <bool clear_memory_, bool mmap_populate>
|
|
class Allocator {
|
|
public:
|
|
void sys_memory_check(size_t size) {
|
|
if (doris::MemTrackerLimiter::sys_mem_exceed_limit_check(size)) {
|
|
if (doris::thread_context()->thread_mem_tracker_mgr->is_attach_query() &&
|
|
doris::thread_context()->thread_mem_tracker_mgr->wait_gc()) {
|
|
int64_t wait_milliseconds = doris::config::thread_wait_gc_max_milliseconds;
|
|
while (wait_milliseconds > 0) {
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
|
if (!doris::MemTrackerLimiter::sys_mem_exceed_limit_check(size)) {
|
|
doris::MemInfo::refresh_interval_memory_growth += size;
|
|
break;
|
|
}
|
|
wait_milliseconds -= 100;
|
|
}
|
|
if (wait_milliseconds <= 0) {
|
|
auto err_msg = fmt::format(
|
|
"Allocator Sys Memory Check Failed In Query/Load: Cannot alloc {}, "
|
|
"{}.",
|
|
size,
|
|
doris::MemTrackerLimiter::process_limit_exceeded_errmsg_str(size));
|
|
doris::thread_context()->thread_mem_tracker_mgr->disable_wait_gc();
|
|
if (!doris::enable_thread_catch_bad_alloc) {
|
|
doris::thread_context()->thread_mem_tracker_mgr->cancel_fragment(err_msg);
|
|
} else {
|
|
LOG(WARNING) << err_msg;
|
|
throw std::bad_alloc {};
|
|
}
|
|
}
|
|
} else if (doris::enable_thread_catch_bad_alloc) {
|
|
LOG(WARNING) << fmt::format(
|
|
"Allocator Sys Memory Check Failed: Cannot alloc {}, {}.", size,
|
|
doris::MemTrackerLimiter::process_limit_exceeded_errmsg_str(size));
|
|
throw std::bad_alloc {};
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Allocate memory range.
|
|
void* alloc(size_t size, size_t alignment = 0) {
|
|
sys_memory_check(size);
|
|
void* buf;
|
|
|
|
if (size >= MMAP_THRESHOLD) {
|
|
if (alignment > MMAP_MIN_ALIGNMENT)
|
|
throw doris::Exception(
|
|
doris::ErrorCode::INVALID_ARGUMENT,
|
|
"Too large alignment {}: more than page size when allocating {}.",
|
|
alignment, size);
|
|
|
|
if (!TRY_CONSUME_THREAD_MEM_TRACKER(size)) {
|
|
RETURN_BAD_ALLOC_IF_PRE_CATCH(
|
|
fmt::format("Allocator Pre Catch: Cannot mmap {}.", size));
|
|
// memory exceeds the limit, consume mem tracker fails, but there is no external catch bad_alloc,
|
|
// alloc will continue to execute, so the consume memtracker is forced.
|
|
CONSUME_THREAD_MEM_TRACKER(size);
|
|
}
|
|
buf = mmap(get_mmap_hint(), size, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
|
|
if (MAP_FAILED == buf) {
|
|
RELEASE_THREAD_MEM_TRACKER(size);
|
|
RETURN_BAD_ALLOC(fmt::format("Allocator: Cannot mmap {}.", size));
|
|
}
|
|
|
|
/// No need for zero-fill, because mmap guarantees it.
|
|
} else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD) {
|
|
doris::Chunk chunk;
|
|
if (!doris::ChunkAllocator::instance()->allocate_align(size, &chunk)) {
|
|
RETURN_BAD_ALLOC(fmt::format("Allocator: Cannot allocate chunk {}.", size));
|
|
}
|
|
buf = chunk.data;
|
|
if constexpr (clear_memory) memset(buf, 0, chunk.size);
|
|
} else {
|
|
if (alignment <= MALLOC_MIN_ALIGNMENT) {
|
|
if constexpr (clear_memory)
|
|
buf = ::calloc(size, 1);
|
|
else
|
|
buf = ::malloc(size);
|
|
|
|
if (nullptr == buf) {
|
|
RETURN_BAD_ALLOC(fmt::format("Allocator: Cannot malloc {}.", size));
|
|
}
|
|
} else {
|
|
buf = nullptr;
|
|
int res = posix_memalign(&buf, alignment, size);
|
|
|
|
if (0 != res) {
|
|
RETURN_BAD_ALLOC(
|
|
fmt::format("Cannot allocate memory (posix_memalign) {}.", size));
|
|
}
|
|
|
|
if constexpr (clear_memory) memset(buf, 0, size);
|
|
}
|
|
}
|
|
return buf;
|
|
}
|
|
|
|
/// Free memory range.
|
|
void free(void* buf, size_t size) {
|
|
if (size >= MMAP_THRESHOLD) {
|
|
if (0 != munmap(buf, size)) {
|
|
auto err = fmt::format("Allocator: Cannot munmap {}.", size);
|
|
LOG(ERROR) << err;
|
|
if (!doris::enable_thread_catch_bad_alloc)
|
|
doris::MemTrackerLimiter::print_log_process_usage(err);
|
|
throw std::bad_alloc {};
|
|
} else {
|
|
RELEASE_THREAD_MEM_TRACKER(size);
|
|
}
|
|
} else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD &&
|
|
((size & (size - 1)) == 0)) {
|
|
// Only power-of-two length are added to ChunkAllocator
|
|
doris::ChunkAllocator::instance()->free((uint8_t*)buf, size);
|
|
} else {
|
|
::free(buf);
|
|
}
|
|
}
|
|
|
|
/** Enlarge memory range.
|
|
* Data from old range is moved to the beginning of new range.
|
|
* Address of memory range could change.
|
|
*/
|
|
void* realloc(void* buf, size_t old_size, size_t new_size, size_t alignment = 0) {
|
|
if (old_size == new_size) {
|
|
/// nothing to do.
|
|
/// BTW, it's not possible to change alignment while doing realloc.
|
|
} else if (old_size < CHUNK_THRESHOLD && new_size < CHUNK_THRESHOLD &&
|
|
alignment <= MALLOC_MIN_ALIGNMENT) {
|
|
sys_memory_check(new_size);
|
|
/// Resize malloc'd memory region with no special alignment requirement.
|
|
void* new_buf = ::realloc(buf, new_size);
|
|
if (nullptr == new_buf) {
|
|
RETURN_BAD_ALLOC(fmt::format("Allocator: Cannot realloc from {} to {}.", old_size,
|
|
new_size));
|
|
}
|
|
|
|
buf = new_buf;
|
|
if constexpr (clear_memory)
|
|
if (new_size > old_size)
|
|
memset(reinterpret_cast<char*>(buf) + old_size, 0, new_size - old_size);
|
|
} else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD) {
|
|
sys_memory_check(new_size);
|
|
/// Resize mmap'd memory region.
|
|
if (!TRY_CONSUME_THREAD_MEM_TRACKER(new_size - old_size)) {
|
|
RETURN_BAD_ALLOC_IF_PRE_CATCH(fmt::format(
|
|
"Allocator Pre Catch: Cannot mremap memory chunk from {} to {}.", old_size,
|
|
new_size));
|
|
CONSUME_THREAD_MEM_TRACKER(new_size - old_size);
|
|
}
|
|
|
|
// On apple and freebsd self-implemented mremap used (common/mremap.h)
|
|
buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE, PROT_READ | PROT_WRITE,
|
|
mmap_flags, -1, 0);
|
|
if (MAP_FAILED == buf) {
|
|
RELEASE_THREAD_MEM_TRACKER(new_size - old_size);
|
|
RETURN_BAD_ALLOC(fmt::format("Allocator: Cannot mremap memory chunk from {} to {}.",
|
|
old_size, new_size));
|
|
}
|
|
|
|
/// No need for zero-fill, because mmap guarantees it.
|
|
|
|
if constexpr (mmap_populate) {
|
|
// MAP_POPULATE seems have no effect for mremap as for mmap,
|
|
// Clear enlarged memory range explicitly to pre-fault the pages
|
|
if (new_size > old_size)
|
|
memset(reinterpret_cast<char*>(buf) + old_size, 0, new_size - old_size);
|
|
}
|
|
} else {
|
|
sys_memory_check(new_size);
|
|
// CHUNK_THRESHOLD <= old_size <= MMAP_THRESHOLD use system realloc is slow, use ChunkAllocator.
|
|
// Big allocs that requires a copy.
|
|
void* new_buf = alloc(new_size, alignment);
|
|
memcpy(new_buf, buf, std::min(old_size, new_size));
|
|
free(buf, old_size);
|
|
buf = new_buf;
|
|
}
|
|
|
|
return buf;
|
|
}
|
|
|
|
protected:
|
|
static constexpr size_t get_stack_threshold() { return 0; }
|
|
|
|
static constexpr bool clear_memory = clear_memory_;
|
|
|
|
// Freshly mmapped pages are copy-on-write references to a global zero page.
|
|
// On the first write, a page fault occurs, and an actual writable page is
|
|
// allocated. If we are going to use this memory soon, such as when resizing
|
|
// hash tables, it makes sense to pre-fault the pages by passing
|
|
// MAP_POPULATE to mmap(). This takes some time, but should be faster
|
|
// overall than having a hot loop interrupted by page faults.
|
|
// It is only supported on Linux.
|
|
static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS
|
|
#if defined(OS_LINUX)
|
|
| (mmap_populate ? MAP_POPULATE : 0)
|
|
#endif
|
|
;
|
|
|
|
private:
|
|
#ifndef NDEBUG
|
|
/// In debug builds, request mmap() at random addresses (a kind of ASLR), to
|
|
/// reproduce more memory stomping bugs. Note that Linux doesn't do it by
|
|
/// default. This may lead to worse TLB performance.
|
|
void* get_mmap_hint() {
|
|
// return reinterpret_cast<void *>(std::uniform_int_distribution<intptr_t>(0x100000000000UL, 0x700000000000UL)(thread_local_rng));
|
|
return nullptr;
|
|
}
|
|
#else
|
|
void* get_mmap_hint() { return nullptr; }
|
|
#endif
|
|
};
|
|
|
|
/** Allocator with optimization to place small memory ranges in automatic memory.
|
|
*/
|
|
template <typename Base, size_t N, size_t Alignment>
|
|
class AllocatorWithStackMemory : private Base {
|
|
private:
|
|
alignas(Alignment) char stack_memory[N];
|
|
|
|
public:
|
|
/// Do not use boost::noncopyable to avoid the warning about direct base
|
|
/// being inaccessible due to ambiguity, when derived classes are also
|
|
/// noncopiable (-Winaccessible-base).
|
|
AllocatorWithStackMemory(const AllocatorWithStackMemory&) = delete;
|
|
AllocatorWithStackMemory& operator=(const AllocatorWithStackMemory&) = delete;
|
|
AllocatorWithStackMemory() = default;
|
|
~AllocatorWithStackMemory() = default;
|
|
|
|
void* alloc(size_t size) {
|
|
if (size <= N) {
|
|
if constexpr (Base::clear_memory) memset(stack_memory, 0, N);
|
|
return stack_memory;
|
|
}
|
|
|
|
return Base::alloc(size, Alignment);
|
|
}
|
|
|
|
void free(void* buf, size_t size) {
|
|
if (size > N) Base::free(buf, size);
|
|
}
|
|
|
|
void* realloc(void* buf, size_t old_size, size_t new_size) {
|
|
/// Was in stack_memory, will remain there.
|
|
if (new_size <= N) return buf;
|
|
|
|
/// Already was big enough to not fit in stack_memory.
|
|
if (old_size > N) return Base::realloc(buf, old_size, new_size, Alignment);
|
|
|
|
/// Was in stack memory, but now will not fit there.
|
|
void* new_buf = Base::alloc(new_size, Alignment);
|
|
memcpy(new_buf, buf, old_size);
|
|
return new_buf;
|
|
}
|
|
|
|
protected:
|
|
static constexpr size_t get_stack_threshold() { return N; }
|
|
};
|