// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Allocator.h
// and modified by Doris

#pragma once

// TODO: Readable

// NOTE(review): the angle-bracket include targets were lost in a text-extraction
// pass; they are reconstructed here from the upstream file and from the names
// this header actually uses (fmt::format, memset/memcpy, mmap/munmap, std::min,
// malloc/calloc/posix_memalign) — verify against the original.
#include <fmt/format.h>
#include <string.h>

#include "common/config.h"
#include "common/status.h"
#include "runtime/memory/chunk.h"
#include "runtime/memory/chunk_allocator.h"
#include "runtime/thread_context.h"

#ifdef NDEBUG
#define ALLOCATOR_ASLR 0
#else
#define ALLOCATOR_ASLR 1
#endif

#if !defined(__APPLE__) && !defined(__FreeBSD__)
#include <malloc.h>
#endif

#include <sys/mman.h>

#include <algorithm>
#include <cstdlib>

#include "common/compiler_util.h"

#ifdef THREAD_SANITIZER
/// Thread sanitizer does not intercept mremap. The usage of mremap will lead to false positives.
#define DISABLE_MREMAP 1
#endif

#include "vec/common/allocator_fwd.h"
#include "vec/common/exception.h"
#include "vec/common/mremap.h"

/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

#ifdef NDEBUG
/**
 * Many modern allocators (for example, tcmalloc) do not do a mremap for
 * realloc, even in case of large enough chunks of memory. Although this allows
 * you to increase performance and reduce memory consumption during realloc.
* To fix this, we do mremap manually if the chunk of memory is large enough. * The threshold (64 MB) is chosen quite large, since changing the address * space is very slow, especially in the case of a large number of threads. We * expect that the set of operations mmap/something to do/mremap can only be * performed about 1000 times per second. * * P.S. This is also required, because tcmalloc can not allocate a chunk of * memory greater than 16 GB. */ static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20); /** * Memory allocation between 4KB and 64MB will be through ChunkAllocator, * those less than 4KB will be through malloc (for example, tcmalloc), * and those greater than 64MB will be through MMAP. * In the actual test, chunkallocator allocates less than 4KB of memory slower than malloc, * and chunkallocator allocates more than 64MB of memory slower than MMAP, * but the 4KB threshold is an empirical value, which needs to be determined * by more detailed test later. */ static constexpr size_t CHUNK_THRESHOLD = 4096; #else /** * In debug build, use small mmap threshold to reproduce more memory * stomping bugs. Along with ASLR it will hopefully detect more issues than * ASan. The program may fail due to the limit on number of memory mappings. */ static constexpr size_t MMAP_THRESHOLD = 4096; static constexpr size_t CHUNK_THRESHOLD = 1024; #endif static constexpr size_t MMAP_MIN_ALIGNMENT = 4096; static constexpr size_t MALLOC_MIN_ALIGNMENT = 8; /** Responsible for allocating / freeing memory. Used, for example, in PODArray, Arena. * Also used in hash tables. 
* The interface is different from std::allocator * - the presence of the method realloc, which for large chunks of memory uses mremap; * - passing the size into the `free` method; * - by the presence of the `alignment` argument; * - the possibility of zeroing memory (used in hash tables); * - random hint address for mmap * - mmap_threshold for using mmap less or more */ template class Allocator { public: /// Allocate memory range. void* alloc(size_t size, size_t alignment = 0) { void* buf; if (size >= MMAP_THRESHOLD) { if (alignment > MMAP_MIN_ALIGNMENT) throw doris::vectorized::Exception( fmt::format( "Too large alignment {}: more than page size when allocating {}.", alignment, size), doris::TStatusCode::VEC_BAD_ARGUMENTS); CONSUME_THREAD_MEM_TRACKER(size); buf = mmap(get_mmap_hint(), size, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); if (MAP_FAILED == buf) { RELEASE_THREAD_MEM_TRACKER(size); auto err = fmt::format("Allocator: Cannot mmap {}.", size); doris::ExecEnv::GetInstance()->process_mem_tracker()->print_log_usage(err); doris::vectorized::throwFromErrno(err, doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY); } /// No need for zero-fill, because mmap guarantees it. 
} else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD) { doris::Chunk chunk; if (!doris::ChunkAllocator::instance()->allocate_align(size, &chunk)) { auto err = fmt::format("Allocator: Cannot allocate chunk {}.", size); doris::ExecEnv::GetInstance()->process_mem_tracker()->print_log_usage(err); doris::vectorized::throwFromErrno(err, doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY); } buf = chunk.data; if constexpr (clear_memory) memset(buf, 0, chunk.size); } else { if (alignment <= MALLOC_MIN_ALIGNMENT) { if constexpr (clear_memory) buf = ::calloc(size, 1); else buf = ::malloc(size); if (nullptr == buf) { auto err = fmt::format("Allocator: Cannot malloc {}.", size); doris::ExecEnv::GetInstance()->process_mem_tracker()->print_log_usage(err); doris::vectorized::throwFromErrno( err, doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY); } } else { buf = nullptr; int res = posix_memalign(&buf, alignment, size); if (0 != res) { auto err = fmt::format("Cannot allocate memory (posix_memalign) {}.", size); doris::ExecEnv::GetInstance()->process_mem_tracker()->print_log_usage(err); doris::vectorized::throwFromErrno( err, doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY, res); } if constexpr (clear_memory) memset(buf, 0, size); } } return buf; } /// Free memory range. void free(void* buf, size_t size) { if (size >= MMAP_THRESHOLD) { if (0 != munmap(buf, size)) { auto err = fmt::format("Allocator: Cannot munmap {}.", size); doris::ExecEnv::GetInstance()->process_mem_tracker()->print_log_usage(err); doris::vectorized::throwFromErrno(err, doris::TStatusCode::VEC_CANNOT_MUNMAP); } else { RELEASE_THREAD_MEM_TRACKER(size); } } else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD && ((size & (size - 1)) == 0)) { // Only power-of-two length are added to ChunkAllocator doris::ChunkAllocator::instance()->free((uint8_t*)buf, size); } else { ::free(buf); } } /** Enlarge memory range. 
* Data from old range is moved to the beginning of new range. * Address of memory range could change. */ void* realloc(void* buf, size_t old_size, size_t new_size, size_t alignment = 0) { if (old_size == new_size) { /// nothing to do. /// BTW, it's not possible to change alignment while doing realloc. } else if (old_size < CHUNK_THRESHOLD && new_size < CHUNK_THRESHOLD && alignment <= MALLOC_MIN_ALIGNMENT) { /// Resize malloc'd memory region with no special alignment requirement. void* new_buf = ::realloc(buf, new_size); if (nullptr == new_buf) { auto err = fmt::format("Allocator: Cannot realloc from {} to {}.", old_size, new_size); doris::ExecEnv::GetInstance()->process_mem_tracker()->print_log_usage(err); doris::vectorized::throwFromErrno(err, doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY); } buf = new_buf; if constexpr (clear_memory) if (new_size > old_size) memset(reinterpret_cast(buf) + old_size, 0, new_size - old_size); } else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD) { /// Resize mmap'd memory region. CONSUME_THREAD_MEM_TRACKER(new_size - old_size); // On apple and freebsd self-implemented mremap used (common/mremap.h) buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0); if (MAP_FAILED == buf) { RELEASE_THREAD_MEM_TRACKER(new_size - old_size); auto err = fmt::format("Allocator: Cannot mremap memory chunk from {} to {}.", old_size, new_size); doris::ExecEnv::GetInstance()->process_mem_tracker()->print_log_usage(err); doris::vectorized::throwFromErrno(err, doris::TStatusCode::VEC_CANNOT_MREMAP); } /// No need for zero-fill, because mmap guarantees it. 
if constexpr (mmap_populate) { // MAP_POPULATE seems have no effect for mremap as for mmap, // Clear enlarged memory range explicitly to pre-fault the pages if (new_size > old_size) memset(reinterpret_cast(buf) + old_size, 0, new_size - old_size); } } else { // CHUNK_THRESHOLD <= old_size <= MMAP_THRESHOLD use system realloc is slow, use ChunkAllocator. // Big allocs that requires a copy. void* new_buf = alloc(new_size, alignment); memcpy(new_buf, buf, std::min(old_size, new_size)); free(buf, old_size); buf = new_buf; } return buf; } protected: static constexpr size_t get_stack_threshold() { return 0; } static constexpr bool clear_memory = clear_memory_; // Freshly mmapped pages are copy-on-write references to a global zero page. // On the first write, a page fault occurs, and an actual writable page is // allocated. If we are going to use this memory soon, such as when resizing // hash tables, it makes sense to pre-fault the pages by passing // MAP_POPULATE to mmap(). This takes some time, but should be faster // overall than having a hot loop interrupted by page faults. // It is only supported on Linux. static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS #if defined(OS_LINUX) | (mmap_populate ? MAP_POPULATE : 0) #endif ; private: #ifndef NDEBUG /// In debug builds, request mmap() at random addresses (a kind of ASLR), to /// reproduce more memory stomping bugs. Note that Linux doesn't do it by /// default. This may lead to worse TLB performance. void* get_mmap_hint() { // return reinterpret_cast(std::uniform_int_distribution(0x100000000000UL, 0x700000000000UL)(thread_local_rng)); return nullptr; } #else void* get_mmap_hint() { return nullptr; } #endif }; /** When using AllocatorWithStackMemory, located on the stack, * GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack. * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this. 
*/ #if !__clang__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfree-nonheap-object" #endif /** Allocator with optimization to place small memory ranges in automatic memory. */ template class AllocatorWithStackMemory : private Base { private: alignas(Alignment) char stack_memory[N]; public: /// Do not use boost::noncopyable to avoid the warning about direct base /// being inaccessible due to ambiguity, when derived classes are also /// noncopiable (-Winaccessible-base). AllocatorWithStackMemory(const AllocatorWithStackMemory&) = delete; AllocatorWithStackMemory& operator=(const AllocatorWithStackMemory&) = delete; AllocatorWithStackMemory() = default; ~AllocatorWithStackMemory() = default; void* alloc(size_t size) { if (size <= N) { if constexpr (Base::clear_memory) memset(stack_memory, 0, N); return stack_memory; } return Base::alloc(size, Alignment); } void free(void* buf, size_t size) { if (size > N) Base::free(buf, size); } void* realloc(void* buf, size_t old_size, size_t new_size) { /// Was in stack_memory, will remain there. if (new_size <= N) return buf; /// Already was big enough to not fit in stack_memory. if (old_size > N) return Base::realloc(buf, old_size, new_size, Alignment); /// Was in stack memory, but now will not fit there. void* new_buf = Base::alloc(new_size, Alignment); memcpy(new_buf, buf, old_size); return new_buf; } protected: static constexpr size_t get_stack_threshold() { return N; } }; #if !__clang__ #pragma GCC diagnostic pop #endif