Disable the ChunkAllocator in the vectorized Allocator; this reduces cached memory. Under highly concurrent queries, routing the vectorized Allocator through the ChunkAllocator reduces contention on the central free-list lock of gperftools tcmalloc. Jemalloc and Google's tcmalloc maintain per-core caches of their own, so the ChunkAllocator may no longer be needed once gperftools tcmalloc is replaced.
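
The switch consulted by the code below is doris::config::disable_chunk_allocator_in_vec (declared in common/config.h). Assuming the usual Doris convention that BE config items can be overridden by name in be.conf (an assumption, not shown in this file), the toggle would look like:

    # be.conf: bypass the ChunkAllocator in the vectorized Allocator
    disable_chunk_allocator_in_vec = true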
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Allocator.h
// and modified by Doris

#pragma once

// TODO: Readable

#include <fmt/format.h>
#include <string.h>

#include <exception>

#include "common/config.h"
#include "common/status.h"
#include "runtime/memory/chunk.h"
#include "runtime/memory/chunk_allocator.h"
#include "runtime/thread_context.h"

#ifdef NDEBUG
#define ALLOCATOR_ASLR 0
#else
#define ALLOCATOR_ASLR 1
#endif

#if !defined(__APPLE__) && !defined(__FreeBSD__)
#include <malloc.h>
#endif

#include <sys/mman.h>

#include <algorithm>
#include <cstdlib>

#include "common/compiler_util.h"
#ifdef THREAD_SANITIZER
/// Thread sanitizer does not intercept mremap. The usage of mremap will lead to false positives.
#define DISABLE_MREMAP 1
#endif
#include "vec/common/allocator_fwd.h"
#include "vec/common/exception.h"
#include "vec/common/mremap.h"

/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

#ifdef NDEBUG
/**
 * Many modern allocators (for example, tcmalloc) do not use mremap for
 * realloc, even for large enough chunks of memory, although doing so would
 * improve performance and reduce memory consumption during realloc.
 * To fix this, we do mremap manually if the chunk of memory is large enough.
 * The threshold (64 MB) is chosen quite large, since changing the address
 * space is very slow, especially in the case of a large number of threads. We
 * expect that the set of operations mmap/something to do/mremap can only be
 * performed about 1000 times per second.
 *
 * P.S. This is also required because tcmalloc cannot allocate a chunk of
 * memory greater than 16 GB.
 */
static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20);
/**
 * Memory allocations between 4 KB and 64 MB go through the ChunkAllocator,
 * those smaller than 4 KB go through malloc (for example, tcmalloc),
 * and those larger than 64 MB go through mmap.
 * In our tests, the ChunkAllocator is slower than malloc for allocations
 * below 4 KB and slower than mmap for allocations above 64 MB, but the
 * 4 KB threshold is an empirical value that needs to be confirmed by more
 * detailed tests later.
 */
static constexpr size_t CHUNK_THRESHOLD = 4096;
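// An illustrative sketch of the resulting routing in release builds, based
// on the thresholds above and on alloc()/free() below (example sizes only):
//   alloc(2 KB)   -> malloc/calloc (below CHUNK_THRESHOLD)
//   alloc(1 MB)   -> ChunkAllocator, unless config::disable_chunk_allocator_in_vec
//   alloc(128 MB) -> mmap (at or above MMAP_THRESHOLD)
// Note that free() only hands power-of-two sizes back to the ChunkAllocator.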
#else
/**
 * In debug build, use small mmap threshold to reproduce more memory
 * stomping bugs. Along with ASLR it will hopefully detect more issues than
 * ASan. The program may fail due to the limit on number of memory mappings.
 */
static constexpr size_t MMAP_THRESHOLD = 4096;
static constexpr size_t CHUNK_THRESHOLD = 1024;
#endif

static constexpr size_t MMAP_MIN_ALIGNMENT = 4096;
static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;

/** Responsible for allocating / freeing memory. Used, for example, in PODArray and Arena.
 * Also used in hash tables.
 * The interface differs from std::allocator in:
 * - the presence of a realloc method, which uses mremap for large chunks of memory;
 * - passing the size into the free method;
 * - the presence of an alignment argument;
 * - the possibility of zeroing memory (used in hash tables);
 * - a random hint address for mmap;
 * - an mmap threshold that controls how often mmap is used.
 */
template <bool clear_memory_, bool mmap_populate>
class Allocator {
public:
    /// Allocate memory range.
    void* alloc(size_t size, size_t alignment = 0) {
        void* buf;

        if (size >= MMAP_THRESHOLD) {
            if (alignment > MMAP_MIN_ALIGNMENT)
                throw doris::vectorized::Exception(
                        fmt::format(
                                "Too large alignment {}: more than page size when allocating {}.",
                                alignment, size),
                        doris::TStatusCode::VEC_BAD_ARGUMENTS);

            CONSUME_THREAD_MEM_TRACKER(size);
            buf = mmap(get_mmap_hint(), size, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
            if (MAP_FAILED == buf) {
                RELEASE_THREAD_MEM_TRACKER(size);
                doris::vectorized::throwFromErrno(fmt::format("Allocator: Cannot mmap {}.", size),
                                                  doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);
            }

            /// No need for zero-fill, because mmap guarantees it.
        } else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD) {
            doris::Chunk chunk;
            if (!doris::ChunkAllocator::instance()->allocate_align(size, &chunk)) {
                doris::vectorized::throwFromErrno(
                        fmt::format("Allocator: Cannot allocate chunk {}.", size),
                        doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);
            }
            buf = chunk.data;
            if constexpr (clear_memory) memset(buf, 0, chunk.size);
        } else {
            if (alignment <= MALLOC_MIN_ALIGNMENT) {
                if constexpr (clear_memory)
                    buf = ::calloc(size, 1);
                else
                    buf = ::malloc(size);

                if (nullptr == buf)
                    doris::vectorized::throwFromErrno(
                            fmt::format("Allocator: Cannot malloc {}.", size),
                            doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);
            } else {
                buf = nullptr;
                int res = posix_memalign(&buf, alignment, size);

                if (0 != res)
                    doris::vectorized::throwFromErrno(
                            fmt::format("Cannot allocate memory (posix_memalign) {}.", size),
                            doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY, res);

                if constexpr (clear_memory) memset(buf, 0, size);
            }
        }
        return buf;
    }

    /// Free memory range.
    void free(void* buf, size_t size) {
        if (size >= MMAP_THRESHOLD) {
            if (0 != munmap(buf, size)) {
                doris::vectorized::throwFromErrno(fmt::format("Allocator: Cannot munmap {}.", size),
                                                  doris::TStatusCode::VEC_CANNOT_MUNMAP);
            } else {
                RELEASE_THREAD_MEM_TRACKER(size);
            }
        } else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD &&
                   ((size & (size - 1)) == 0)) {
            // Only power-of-two lengths are added to the ChunkAllocator.
            doris::ChunkAllocator::instance()->free((uint8_t*)buf, size);
        } else {
            ::free(buf);
        }
    }

    /** Enlarge memory range.
     * Data from old range is moved to the beginning of new range.
     * Address of memory range could change.
     */
    void* realloc(void* buf, size_t old_size, size_t new_size, size_t alignment = 0) {
        if (old_size == new_size) {
            /// nothing to do.
            /// BTW, it's not possible to change alignment while doing realloc.
        } else if (old_size < MMAP_THRESHOLD && new_size < MMAP_THRESHOLD &&
                   alignment <= MALLOC_MIN_ALIGNMENT) {
            /// Resize malloc'd memory region with no special alignment requirement.
            void* new_buf = ::realloc(buf, new_size);
            if (nullptr == new_buf)
                doris::vectorized::throwFromErrno("Allocator: Cannot realloc from " +
                                                          std::to_string(old_size) + " to " +
                                                          std::to_string(new_size) + ".",
                                                  doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);

            buf = new_buf;
            if constexpr (clear_memory)
                if (new_size > old_size)
                    memset(reinterpret_cast<char*>(buf) + old_size, 0, new_size - old_size);
        } else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD) {
            /// Resize mmap'd memory region.
            CONSUME_THREAD_MEM_TRACKER(new_size - old_size);

            // On Apple and FreeBSD, a self-implemented mremap is used (vec/common/mremap.h).
            buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE, PROT_READ | PROT_WRITE,
                                    mmap_flags, -1, 0);
            if (MAP_FAILED == buf) {
                RELEASE_THREAD_MEM_TRACKER(new_size - old_size);
                doris::vectorized::throwFromErrno("Allocator: Cannot mremap memory chunk from " +
                                                          std::to_string(old_size) + " to " +
                                                          std::to_string(new_size) + ".",
                                                  doris::TStatusCode::VEC_CANNOT_MREMAP);
            }

            /// No need for zero-fill, because mmap guarantees it.

            if constexpr (mmap_populate) {
                // MAP_POPULATE seems to have no effect for mremap as it does for mmap;
                // clear the enlarged memory range explicitly to pre-fault the pages.
                if (new_size > old_size)
                    memset(reinterpret_cast<char*>(buf) + old_size, 0, new_size - old_size);
            }
        } else {
            // Big allocations that require a copy.
            void* new_buf = alloc(new_size, alignment);
            memcpy(new_buf, buf, std::min(old_size, new_size));
            free(buf, old_size);
            buf = new_buf;
        }

        return buf;
    }

protected:
    static constexpr size_t get_stack_threshold() { return 0; }

    static constexpr bool clear_memory = clear_memory_;

    // Freshly mmapped pages are copy-on-write references to a global zero page.
    // On the first write, a page fault occurs, and an actual writable page is
    // allocated. If we are going to use this memory soon, such as when resizing
    // hash tables, it makes sense to pre-fault the pages by passing
    // MAP_POPULATE to mmap(). This takes some time, but should be faster
    // overall than having a hot loop interrupted by page faults.
    // It is only supported on Linux.
    static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS
#if defined(OS_LINUX)
                                      | (mmap_populate ? MAP_POPULATE : 0)
#endif
            ;

private:
#ifndef NDEBUG
    /// In debug builds, request mmap() at random addresses (a kind of ASLR), to
    /// reproduce more memory stomping bugs. Note that Linux doesn't do it by
    /// default. This may lead to worse TLB performance.
    void* get_mmap_hint() {
        // return reinterpret_cast<void *>(std::uniform_int_distribution<intptr_t>(0x100000000000UL, 0x700000000000UL)(thread_local_rng));
        return nullptr;
    }
#else
    void* get_mmap_hint() { return nullptr; }
#endif
};

/** When using AllocatorWithStackMemory, located on the stack,
 * GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack.
 * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this.
 */
#if !__clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
#endif

/** Allocator with optimization to place small memory ranges in automatic memory.
 */
template <typename Base, size_t N, size_t Alignment>
class AllocatorWithStackMemory : private Base {
private:
    alignas(Alignment) char stack_memory[N];

public:
    /// Do not use boost::noncopyable to avoid the warning about direct base
    /// being inaccessible due to ambiguity when derived classes are also
    /// noncopyable (-Winaccessible-base).
    AllocatorWithStackMemory(const AllocatorWithStackMemory&) = delete;
    AllocatorWithStackMemory& operator=(const AllocatorWithStackMemory&) = delete;
    AllocatorWithStackMemory() = default;
    ~AllocatorWithStackMemory() = default;

    void* alloc(size_t size) {
        if (size <= N) {
            if constexpr (Base::clear_memory) memset(stack_memory, 0, N);
            return stack_memory;
        }

        return Base::alloc(size, Alignment);
    }

    void free(void* buf, size_t size) {
        if (size > N) Base::free(buf, size);
    }

    void* realloc(void* buf, size_t old_size, size_t new_size) {
        /// Was in stack_memory, will remain there.
        if (new_size <= N) return buf;

        /// Already was big enough to not fit in stack_memory.
        if (old_size > N) return Base::realloc(buf, old_size, new_size, Alignment);

        /// Was in stack memory, but now will not fit there.
        void* new_buf = Base::alloc(new_size, Alignment);
        memcpy(new_buf, buf, old_size);
        return new_buf;
    }

protected:
    static constexpr size_t get_stack_threshold() { return N; }
};

#if !__clang__
#pragma GCC diagnostic pop
#endif
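
For reference, a minimal usage sketch of the two allocator types above. This is a hypothetical example, not part of the header; it assumes config::disable_chunk_allocator_in_vec is enabled (the change described at the top), so that small requests take the plain malloc/realloc/free path, and the buffer sizes and the 256-byte stack threshold are arbitrary values chosen for illustration.

// Hypothetical usage example; not part of allocator.h.
inline void allocator_usage_example() {
    // clear_memory_ = true: returned memory is zero-filled, as hash tables expect.
    Allocator<true, false> heap_alloc;
    void* buf = heap_alloc.alloc(1024);        // calloc path for small sizes
    buf = heap_alloc.realloc(buf, 1024, 2048); // ::realloc; the enlarged tail is zeroed
    heap_alloc.free(buf, 2048);                // the size must be passed back to free

    // Requests up to N (here 256) are served from the object's own stack_memory;
    // larger ones fall through to the heap-backed Base allocator.
    AllocatorWithStackMemory<Allocator<false, false>, 256, 16> small_alloc;
    void* p = small_alloc.alloc(128);          // returns a pointer into stack_memory
    p = small_alloc.realloc(p, 128, 512);      // copied out to a heap allocation
    small_alloc.free(p, 512);
}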