// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. #pragma once #include "runtime/bufferpool/buffer_pool_internal.h" #include "runtime/bufferpool/free_list.h" #include "runtime/memory/mem_tracker.h" #include "util/aligned_new.h" namespace doris { /// The internal buffer allocator used by BufferPool to allocator power-of-two sized /// buffers. BufferAllocator builds on top of SystemAllocator by adding caching of /// free buffers and clean pages where the memory is not currently in use by a client /// but has not yet been released to SystemAllocator. /// /// The allocator is optimised for the common case where an allocation can be served /// by reclaiming a buffer of the request size from the current core's arena. In this /// case there is no contention for locks between concurrently-running threads. If this /// fails, progressively more expensive approaches to allocate memory are tried until /// the allocation eventually success (see AllocateInternal() for details). /// /// Buffer Reservations /// =================== /// The implementation of the BufferAllocator relies on the BufferPool's reservation /// tracking system. The allocator is given a hard limit ('system_bytes_limit'), above /// which all allocations will fail. Allocations up to 'system_bytes_limit' are /// guaranteed to succeed unless an unexpected system error occurs (e.g. we can't allocate /// all of the required memory from the OS). Reservations must be set up so that the total /// of all reservations does not exceed 'system_bytes_limit', thus ensuring that /// BufferAllocator can always find memory to fulfill reservations. /// /// +========================+ /// | IMPLEMENTATION NOTES | /// +========================+ /// /// Memory /// ====== /// Memory managed by BufferAllocator comes in four forms: /// 1. Buffers returned to the client (corresponding to a used reservation) /// 2. Free buffers cached in the BufferAllocator's free lists. /// 3. Buffers attached to clean unpinned pages in the BufferAllocator's clean page lists. /// 4. Bytes that are not allocated from the system: 'system_bytes_remaining_'. /// Together these always add up to 'system_bytes_limit', which allows BufferAllocator /// to always fulfill reservations via some combination of memory in forms 2, 3 or 4. /// /// The BufferAllocator code is careful not to make memory inaccessible to concurrently /// executing threads that are entitled to it. E.g. if one thread is entitled to allocate /// a 1MB buffer from the BufferAllocator's free or clean page lists but needs to release /// a 2MB buffer to the system to free up enough memory, it must add 1MB to /// 'system_bytes_remaining_' in the same critical section in which it freed the 2MB /// buffer. Otherwise a concurrent thread that had a reservation for 1MB of memory might /// not be able to find it. /// /// Arenas /// ====== /// The buffer allocator's data structures are broken up into arenas, with an arena per /// core. Within each arena, each buffer or page is stored in a list with buffers and /// pages of the same size: there is a separate list for every power-of-two size. Each /// arena is protected by a separate lock, so in the common case where threads are able /// to fulfill allocations from their own arena, there will be no lock contention. /// struct BufferPool::BufferAllocator { BufferAllocator(BufferPool* pool, int64_t min_buffer_len, int64_t system_bytes_limit, int64_t clean_page_bytes_limit); ~BufferAllocator(); /// Allocate a buffer with a power-of-two length 'len'. This function may acquire /// 'FreeBufferArena::lock_' and Page::lock so no locks lower in the lock acquisition /// order (see buffer-pool-internal.h) should be held by the caller. /// /// Always succeeds on allocating memory up to 'system_bytes_limit', unless the system /// is unable to give us 'system_bytes_limit' of memory or an internal bug: if all /// clients write out enough dirty pages to stay within their reservation, then there /// should always be enough free buffers and clean pages to reclaim. Status Allocate(ClientHandle* client, int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT; /// Frees 'buffer', which must be open before calling. Closes 'buffer' and updates /// internal state but does not release to any reservation. void Free(BufferPool::BufferHandle&& buffer); /// Adds a clean page 'page' to a clean page list. Caller must hold the page's /// client's lock via 'client_lock' so that moving the page between the client list and /// the free page list is atomic. Caller must not hold 'FreeBufferArena::lock_' or any /// Page::lock. void AddCleanPage(const std::unique_lock& client_lock, Page* page); /// Removes a clean page 'page' from a clean page list and returns true, if present in /// one of the lists. Returns true if it was present. If 'claim_buffer' is true, the /// caller must have reservation for the buffer, which is returned along with the page. /// Otherwise the buffer is moved directly to the free buffer list. Caller must hold /// the page's client's lock via 'client_lock' so that moving the page between the /// client list and the free page list is atomic. Caller must not hold /// 'FreeBufferArena::lock_' or any Page::lock. bool RemoveCleanPage(const std::unique_lock& client_lock, bool claim_buffer, Page* page); /// Periodically called to release free buffers back to the SystemAllocator. Releases /// buffers based on recent allocation patterns, trying to minimise the number of /// excess buffers retained in each list above the minimum required to avoid going /// to the system allocator. void Maintenance(); /// Try to release at least 'bytes_to_free' bytes of memory to the system allocator. void ReleaseMemory(int64_t bytes_to_free); int64_t system_bytes_limit() const { return system_bytes_limit_; } /// Return the amount of memory currently allocated from the system. int64_t GetSystemBytesAllocated() const { return system_bytes_limit_ - system_bytes_remaining_.load(); } /// Return the total number of free buffers in the allocator. int64_t GetNumFreeBuffers() const; /// Return the total bytes of free buffers in the allocator. int64_t GetFreeBufferBytes() const; /// Return the limit on bytes of clean pages in the allocator. int64_t GetCleanPageBytesLimit() const; /// Return the total number of clean pages in the allocator. int64_t GetNumCleanPages() const; /// Return the total bytes of clean pages in the allocator. int64_t GetCleanPageBytes() const; std::string DebugString(); protected: friend class BufferAllocatorTest; friend class BufferPoolTest; friend class FreeBufferArena; /// Test helper: gets the current size of the free list for buffers of 'len' bytes /// on core 'core'. int GetFreeListSize(int core, int64_t len); /// Test helper: reduce the number of scavenge attempts so backend tests can force /// use of the "locked" scavenging code path. void set_max_scavenge_attempts(int val) { DCHECK_GE(val, 1); max_scavenge_attempts_ = val; } private: /// Compute the maximum power-of-two buffer length that could be allocated based on the /// amount of memory available 'system_bytes_limit'. The value is always at least /// 'min_buffer_len' so that there is at least one valid buffer size. static int64_t CalcMaxBufferLen(int64_t min_buffer_len, int64_t system_bytes_limit); /// Same as Allocate() but leaves 'buffer->client_' nullptr and does not update counters. Status AllocateInternal(int64_t len, BufferPool::BufferHandle* buffer) WARN_UNUSED_RESULT; /// Tries to reclaim enough memory from various sources so that the caller can allocate /// a buffer of 'target_bytes' from the system allocator. Scavenges buffers from the /// free buffer and clean page lists of all cores and frees them with /// 'system_allocator_'. Also tries to decrement 'system_bytes_remaining_'. /// 'current_core' is the index of the current CPU core. Any bytes freed in excess of /// 'target_bytes' are added to 'system_bytes_remaining_.' If 'slow_but_sure' is true, /// this function uses a slower strategy that guarantees enough memory will be found /// but can block progress of other threads for longer. If 'slow_but_sure' is false, /// then this function optimistically tries to reclaim the memory but may not reclaim /// 'target_bytes' of memory. Returns the number of bytes reclaimed. int64_t ScavengeBuffers(bool slow_but_sure, int current_core, int64_t target_bytes); /// Helper to free a list of buffers to the system. Returns the number of bytes freed. int64_t FreeToSystem(std::vector&& buffers); /// Compute a sum over all arenas. Does not lock the arenas. int64_t SumOverArenas(std::function compute_fn) const; /// The pool that this allocator is associated with. BufferPool* const pool_; /// System allocator that is ultimately used to allocate and free buffers. const std::unique_ptr system_allocator_; /// The minimum power-of-two buffer length that can be allocated. const int64_t min_buffer_len_; /// The maximum power-of-two buffer length that can be allocated. Always >= /// 'min_buffer_len' so that there is at least one valid buffer size. const int64_t max_buffer_len_; /// The log2 of 'min_buffer_len_'. const int log_min_buffer_len_; /// The log2 of 'max_buffer_len_'. const int log_max_buffer_len_; /// The maximum physical memory in bytes that will be allocated from the system. const int64_t system_bytes_limit_; /// The remaining number of bytes of 'system_bytes_limit_' that can be used for /// allocating new buffers. Must be updated atomically before a new buffer is /// allocated or after an existing buffer is freed with the system allocator. std::atomic system_bytes_remaining_; /// The maximum bytes of clean pages that can accumulate across all arenas before /// they will be evicted. const int64_t clean_page_bytes_limit_; /// The number of bytes of 'clean_page_bytes_limit_' not used by clean pages. I.e. /// (clean_page_bytes_limit - bytes of clean pages in the BufferAllocator). /// 'clean_pages_bytes_limit_' is enforced by increasing this value before a /// clean page is added and decreasing it after a clean page is reclaimed or evicted. std::atomic clean_page_bytes_remaining_; /// Free and clean pages. One arena per core. std::vector> per_core_arenas_; /// Default number of times to attempt scavenging. static const int MAX_SCAVENGE_ATTEMPTS = 3; /// Number of times to attempt scavenging. Usually MAX_SCAVENGE_ATTEMPTS but can be /// overridden by tests. The first max_scavenge_attempts_ - 1 attempts do not lock /// all arenas so may fail. The final attempt locks all arenas, which is expensive /// but is guaranteed to succeed. int max_scavenge_attempts_; std::unique_ptr _mem_tracker; }; } // namespace doris