Disable the ChunkAllocator in the vectorized Allocator; this reduces cached memory. Under highly concurrent queries, routing the vectorized Allocator through the ChunkAllocator reduces contention on the central free-list lock of gperftools tcmalloc. Jemalloc and Google's tcmalloc maintain per-core caches of their own, so the ChunkAllocator may no longer be needed once gperftools tcmalloc is replaced.
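
The switch consulted by the code below is doris::config::disable_chunk_allocator_in_vec (declared in common/config.h). Assuming the usual Doris convention that BE config items can be overridden by name in be.conf (an assumption, not shown in this file), the toggle would look like:

    # be.conf: bypass the ChunkAllocator in the vectorized Allocator
    disable_chunk_allocator_in_vec = true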
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Allocator.h
// and modified by Doris

#pragma once

// TODO: Readable

#include <fmt/format.h>
#include <string.h>

#include <exception>

#include "common/config.h"
#include "common/status.h"
#include "runtime/memory/chunk.h"
#include "runtime/memory/chunk_allocator.h"
#include "runtime/thread_context.h"

#ifdef NDEBUG
#define ALLOCATOR_ASLR 0
#else
#define ALLOCATOR_ASLR 1
#endif

#if !defined(__APPLE__) && !defined(__FreeBSD__)
#include <malloc.h>
#endif

#include <sys/mman.h>

#include <algorithm>
#include <cstdlib>

#include "common/compiler_util.h"
#ifdef THREAD_SANITIZER
/// Thread sanitizer does not intercept mremap. The usage of mremap will lead to false positives.
#define DISABLE_MREMAP 1
#endif
#include "vec/common/allocator_fwd.h"
#include "vec/common/exception.h"
#include "vec/common/mremap.h"

/// Required for older Darwin builds, that lack definition of MAP_ANONYMOUS
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif

#ifdef NDEBUG
/**
 * Many modern allocators (for example, tcmalloc) do not use mremap for
 * realloc, even for large enough chunks of memory, although doing so would
 * improve performance and reduce memory consumption during realloc.
 * To fix this, we do mremap manually if the chunk of memory is large enough.
 * The threshold (64 MB) is chosen quite large, since changing the address
 * space is very slow, especially in the case of a large number of threads. We
 * expect that the set of operations mmap/something to do/mremap can only be
 * performed about 1000 times per second.
 *
 * P.S. This is also required because tcmalloc cannot allocate a chunk of
 * memory greater than 16 GB.
 */
static constexpr size_t MMAP_THRESHOLD = 64 * (1ULL << 20);
/**
 * Memory allocations between 4 KB and 64 MB go through the ChunkAllocator,
 * those smaller than 4 KB go through malloc (for example, tcmalloc),
 * and those larger than 64 MB go through mmap.
 * In our tests, the ChunkAllocator is slower than malloc for allocations
 * below 4 KB and slower than mmap for allocations above 64 MB, but the
 * 4 KB threshold is an empirical value that needs to be confirmed by more
 * detailed tests later.
 */
static constexpr size_t CHUNK_THRESHOLD = 4096;
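// An illustrative sketch of the resulting routing in release builds, based
// on the thresholds above and on alloc()/free() below (example sizes only):
//   alloc(2 KB)   -> malloc/calloc (below CHUNK_THRESHOLD)
//   alloc(1 MB)   -> ChunkAllocator, unless config::disable_chunk_allocator_in_vec
//   alloc(128 MB) -> mmap (at or above MMAP_THRESHOLD)
// Note that free() only hands power-of-two sizes back to the ChunkAllocator.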
#else
/**
 * In debug build, use small mmap threshold to reproduce more memory
 * stomping bugs. Along with ASLR it will hopefully detect more issues than
 * ASan. The program may fail due to the limit on number of memory mappings.
 */
static constexpr size_t MMAP_THRESHOLD = 4096;
static constexpr size_t CHUNK_THRESHOLD = 1024;
#endif

static constexpr size_t MMAP_MIN_ALIGNMENT = 4096;
static constexpr size_t MALLOC_MIN_ALIGNMENT = 8;

/** Responsible for allocating / freeing memory. Used, for example, in PODArray and Arena.
 * Also used in hash tables.
 * The interface differs from std::allocator in:
 * - the presence of a realloc method, which uses mremap for large chunks of memory;
 * - passing the size into the free method;
 * - the presence of an alignment argument;
 * - the possibility of zeroing memory (used in hash tables);
 * - a random hint address for mmap;
 * - an mmap threshold that controls how often mmap is used.
 */
template <bool clear_memory_, bool mmap_populate>
class Allocator {
public:
    /// Allocate memory range.
    void* alloc(size_t size, size_t alignment = 0) {
        void* buf;

        if (size >= MMAP_THRESHOLD) {
            if (alignment > MMAP_MIN_ALIGNMENT)
                throw doris::vectorized::Exception(
                        fmt::format(
                                "Too large alignment {}: more than page size when allocating {}.",
                                alignment, size),
                        doris::TStatusCode::VEC_BAD_ARGUMENTS);

            CONSUME_THREAD_MEM_TRACKER(size);
            buf = mmap(get_mmap_hint(), size, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
            if (MAP_FAILED == buf) {
                RELEASE_THREAD_MEM_TRACKER(size);
                doris::vectorized::throwFromErrno(fmt::format("Allocator: Cannot mmap {}.", size),
                                                  doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);
            }

            /// No need for zero-fill, because mmap guarantees it.
        } else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD) {
            doris::Chunk chunk;
            if (!doris::ChunkAllocator::instance()->allocate_align(size, &chunk)) {
                doris::vectorized::throwFromErrno(
                        fmt::format("Allocator: Cannot allocate chunk {}.", size),
                        doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);
            }
            buf = chunk.data;
            if constexpr (clear_memory) memset(buf, 0, chunk.size);
        } else {
            if (alignment <= MALLOC_MIN_ALIGNMENT) {
                if constexpr (clear_memory)
                    buf = ::calloc(size, 1);
                else
                    buf = ::malloc(size);

                if (nullptr == buf)
                    doris::vectorized::throwFromErrno(
                            fmt::format("Allocator: Cannot malloc {}.", size),
                            doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);
            } else {
                buf = nullptr;
                int res = posix_memalign(&buf, alignment, size);

                if (0 != res)
                    doris::vectorized::throwFromErrno(
                            fmt::format("Cannot allocate memory (posix_memalign) {}.", size),
                            doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY, res);

                if constexpr (clear_memory) memset(buf, 0, size);
            }
        }
        return buf;
    }

    /// Free memory range.
    void free(void* buf, size_t size) {
        if (size >= MMAP_THRESHOLD) {
            if (0 != munmap(buf, size)) {
                doris::vectorized::throwFromErrno(fmt::format("Allocator: Cannot munmap {}.", size),
                                                  doris::TStatusCode::VEC_CANNOT_MUNMAP);
            } else {
                RELEASE_THREAD_MEM_TRACKER(size);
            }
        } else if (!doris::config::disable_chunk_allocator_in_vec && size >= CHUNK_THRESHOLD &&
                   ((size & (size - 1)) == 0)) {
            // Only power-of-two lengths are added to the ChunkAllocator.
            doris::ChunkAllocator::instance()->free((uint8_t*)buf, size);
        } else {
            ::free(buf);
        }
    }

    /** Enlarge memory range.
     * Data from old range is moved to the beginning of new range.
     * Address of memory range could change.
     */
    void* realloc(void* buf, size_t old_size, size_t new_size, size_t alignment = 0) {
        if (old_size == new_size) {
            /// nothing to do.
            /// BTW, it's not possible to change alignment while doing realloc.
        } else if (old_size < MMAP_THRESHOLD && new_size < MMAP_THRESHOLD &&
                   alignment <= MALLOC_MIN_ALIGNMENT) {
            /// Resize malloc'd memory region with no special alignment requirement.
            void* new_buf = ::realloc(buf, new_size);
            if (nullptr == new_buf)
                doris::vectorized::throwFromErrno("Allocator: Cannot realloc from " +
                                                          std::to_string(old_size) + " to " +
                                                          std::to_string(new_size) + ".",
                                                  doris::TStatusCode::VEC_CANNOT_ALLOCATE_MEMORY);

            buf = new_buf;
            if constexpr (clear_memory)
                if (new_size > old_size)
                    memset(reinterpret_cast<char*>(buf) + old_size, 0, new_size - old_size);
        } else if (old_size >= MMAP_THRESHOLD && new_size >= MMAP_THRESHOLD) {
            /// Resize mmap'd memory region.
            CONSUME_THREAD_MEM_TRACKER(new_size - old_size);

            // On Apple and FreeBSD, a self-implemented mremap is used (vec/common/mremap.h).
            buf = clickhouse_mremap(buf, old_size, new_size, MREMAP_MAYMOVE, PROT_READ | PROT_WRITE,
                                    mmap_flags, -1, 0);
            if (MAP_FAILED == buf) {
                RELEASE_THREAD_MEM_TRACKER(new_size - old_size);
                doris::vectorized::throwFromErrno("Allocator: Cannot mremap memory chunk from " +
                                                          std::to_string(old_size) + " to " +
                                                          std::to_string(new_size) + ".",
                                                  doris::TStatusCode::VEC_CANNOT_MREMAP);
            }

            /// No need for zero-fill, because mmap guarantees it.

            if constexpr (mmap_populate) {
                // MAP_POPULATE seems to have no effect for mremap as it does for mmap;
                // clear the enlarged memory range explicitly to pre-fault the pages.
                if (new_size > old_size)
                    memset(reinterpret_cast<char*>(buf) + old_size, 0, new_size - old_size);
            }
        } else {
            // Big allocations that require a copy.
            void* new_buf = alloc(new_size, alignment);
            memcpy(new_buf, buf, std::min(old_size, new_size));
            free(buf, old_size);
            buf = new_buf;
        }

        return buf;
    }

protected:
    static constexpr size_t get_stack_threshold() { return 0; }

    static constexpr bool clear_memory = clear_memory_;

    // Freshly mmapped pages are copy-on-write references to a global zero page.
    // On the first write, a page fault occurs, and an actual writable page is
    // allocated. If we are going to use this memory soon, such as when resizing
    // hash tables, it makes sense to pre-fault the pages by passing
    // MAP_POPULATE to mmap(). This takes some time, but should be faster
    // overall than having a hot loop interrupted by page faults.
    // It is only supported on Linux.
    static constexpr int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS
#if defined(OS_LINUX)
                                      | (mmap_populate ? MAP_POPULATE : 0)
#endif
            ;

private:
#ifndef NDEBUG
    /// In debug builds, request mmap() at random addresses (a kind of ASLR), to
    /// reproduce more memory stomping bugs. Note that Linux doesn't do it by
    /// default. This may lead to worse TLB performance.
    void* get_mmap_hint() {
        // return reinterpret_cast<void *>(std::uniform_int_distribution<intptr_t>(0x100000000000UL, 0x700000000000UL)(thread_local_rng));
        return nullptr;
    }
#else
    void* get_mmap_hint() { return nullptr; }
#endif
};

/** When using AllocatorWithStackMemory, located on the stack,
 * GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack.
 * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this.
 */
#if !__clang__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wfree-nonheap-object"
#endif

/** Allocator with optimization to place small memory ranges in automatic memory.
 */
template <typename Base, size_t N, size_t Alignment>
class AllocatorWithStackMemory : private Base {
private:
    alignas(Alignment) char stack_memory[N];

public:
    /// Do not use boost::noncopyable to avoid the warning about direct base
    /// being inaccessible due to ambiguity when derived classes are also
    /// noncopyable (-Winaccessible-base).
    AllocatorWithStackMemory(const AllocatorWithStackMemory&) = delete;
    AllocatorWithStackMemory& operator=(const AllocatorWithStackMemory&) = delete;
    AllocatorWithStackMemory() = default;
    ~AllocatorWithStackMemory() = default;

    void* alloc(size_t size) {
        if (size <= N) {
            if constexpr (Base::clear_memory) memset(stack_memory, 0, N);
            return stack_memory;
        }

        return Base::alloc(size, Alignment);
    }

    void free(void* buf, size_t size) {
        if (size > N) Base::free(buf, size);
    }

    void* realloc(void* buf, size_t old_size, size_t new_size) {
        /// Was in stack_memory, will remain there.
        if (new_size <= N) return buf;

        /// Already was big enough to not fit in stack_memory.
        if (old_size > N) return Base::realloc(buf, old_size, new_size, Alignment);

        /// Was in stack memory, but now will not fit there.
        void* new_buf = Base::alloc(new_size, Alignment);
        memcpy(new_buf, buf, old_size);
        return new_buf;
    }

protected:
    static constexpr size_t get_stack_threshold() { return N; }
};

#if !__clang__
#pragma GCC diagnostic pop
#endif
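
For reference, a minimal usage sketch of the two allocator types above. This is a hypothetical example, not part of the header; it assumes config::disable_chunk_allocator_in_vec is enabled (the change described at the top), so that small requests take the plain malloc/realloc/free path, and the buffer sizes and the 256-byte stack threshold are arbitrary values chosen for illustration.

// Hypothetical usage example; not part of allocator.h.
inline void allocator_usage_example() {
    // clear_memory_ = true: returned memory is zero-filled, as hash tables expect.
    Allocator<true, false> heap_alloc;
    void* buf = heap_alloc.alloc(1024);        // calloc path for small sizes
    buf = heap_alloc.realloc(buf, 1024, 2048); // ::realloc; the enlarged tail is zeroed
    heap_alloc.free(buf, 2048);                // the size must be passed back to free

    // Requests up to N (here 256) are served from the object's own stack_memory;
    // larger ones fall through to the heap-backed Base allocator.
    AllocatorWithStackMemory<Allocator<false, false>, 256, 16> small_alloc;
    void* p = small_alloc.alloc(128);          // returns a pointer into stack_memory
    p = small_alloc.realloc(p, 128, 512);      // copied out to a heap allocation
    small_alloc.free(p, 512);
}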