3274 lines
84 KiB
Go
3274 lines
84 KiB
Go
// Copyright 2025 PingCAP, Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package memory
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"sync"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// ArbitrateResult represents the results of the arbitration process
|
|
type ArbitrateResult int32
|
|
|
|
const (
|
|
// ArbitrateOk indicates that the arbitration is successful.
|
|
ArbitrateOk ArbitrateResult = iota
|
|
// ArbitrateFail indicates that the arbitration is failed
|
|
ArbitrateFail
|
|
)
|
|
|
|
// SoftLimitMode represents the mode of soft limit for the mem-arbitrator
|
|
type SoftLimitMode int32
|
|
|
|
const (
|
|
// SoftLimitModeDisable indicates that soft-limit is same as the threshold of oom risk
|
|
SoftLimitModeDisable SoftLimitMode = iota
|
|
// SoftLimitModeSpecified indicates that the soft-limit is a specified num of bytes or rate of the limit
|
|
SoftLimitModeSpecified
|
|
// SoftLimitModeAuto indicates that the soft-limit is auto calculated by the mem-arbitrator
|
|
SoftLimitModeAuto
|
|
)
|
|
|
|
const (
|
|
// ArbitratorSoftLimitModDisableName is the name of the soft limit mode default
|
|
ArbitratorSoftLimitModDisableName = "0"
|
|
// ArbitratorSoftLimitModeAutoName is the name of the soft limit mode auto
|
|
ArbitratorSoftLimitModeAutoName = "auto"
|
|
// ArbitratorModeStandardName is the name of the standard mode
|
|
ArbitratorModeStandardName = "standard"
|
|
// ArbitratorModePriorityName is the name of the priority mode
|
|
ArbitratorModePriorityName = "priority"
|
|
// ArbitratorModeDisableName is the name of the disable mode
|
|
ArbitratorModeDisableName = "disable"
|
|
// DefMaxLimit is the default maximum limit of mem quota
|
|
DefMaxLimit int64 = 5e15
|
|
|
|
defTaskTickDur = time.Millisecond * 10
|
|
defMinHeapFreeBPS int64 = 100 * byteSizeMB
|
|
defHeapReclaimCheckDuration = time.Second * 1
|
|
defHeapReclaimCheckMaxDuration = time.Second * 5
|
|
defOOMRiskRatio = 0.95
|
|
defMemRiskRatio = 0.9
|
|
defTickDurMilli = kilo * 1 // 1s
|
|
defStorePoolMediumCapDurMilli = defTickDurMilli * 10 // 10s
|
|
defTrackMemStatsDurMilli = kilo * 1
|
|
defMax int64 = 9e15
|
|
defServerlimitSmallLimitNum = 1000
|
|
defServerlimitMinUnitNum = 500
|
|
defServerlimitMaxUnitNum = 100
|
|
defUpdateMemConsumedTimeAlignSec = 30
|
|
defUpdateMemMagnifUtimeAlign = 30
|
|
defUpdateBufferTimeAlignSec = 60
|
|
defRedundancy = 2
|
|
defPoolReservedQuota = byteSizeMB
|
|
defAwaitFreePoolAllocAlignSize = defPoolReservedQuota + byteSizeMB
|
|
defAwaitFreePoolShardNum int64 = 256
|
|
defAwaitFreePoolShrinkDurMilli = kilo * 2
|
|
defPoolStatusShards = 128
|
|
defPoolQuotaShards = 27 // quota >= BaseQuotaUnit * 2^(max_shards - 2) will be put into the last shard
|
|
prime64 uint64 = 1099511628211
|
|
initHashKey uint64 = 14695981039346656037
|
|
defKillCancelCheckTimeout = time.Second * 20
|
|
defDigestProfileSmallMemTimeoutSec = 60 * 60 * 24 // 1 day
|
|
defDigestProfileMemTimeoutSec = 60 * 60 * 24 * 7 // 1 week
|
|
baseQuotaUnit = 4 * byteSizeKB // 4KB
|
|
defMaxMagnif = kilo * 10
|
|
defMaxDigestProfileCacheLimit = 4e4
|
|
)
|
|
|
|
// ArbitratorWorkMode represents the work mode of the arbitrator: Standard, Priority, Disable
|
|
type ArbitratorWorkMode int32
|
|
|
|
const (
|
|
// ArbitratorModeStandard indicates the standard mode
|
|
ArbitratorModeStandard ArbitratorWorkMode = iota
|
|
// ArbitratorModePriority indicates the priority mode
|
|
ArbitratorModePriority
|
|
// ArbitratorModeDisable indicates the mem-arbitrator is disabled
|
|
ArbitratorModeDisable
|
|
|
|
maxArbitratorMode
|
|
)
|
|
|
|
// ArbitrationPriority represents the priority of the task: Low, Medium, High
|
|
type ArbitrationPriority int32
|
|
|
|
type entryExecState int32
|
|
|
|
const (
|
|
execStateIdle entryExecState = iota
|
|
execStateRunning
|
|
execStatePrivileged
|
|
)
|
|
|
|
const (
|
|
// ArbitrationPriorityLow indicates the low priority
|
|
ArbitrationPriorityLow ArbitrationPriority = iota
|
|
// ArbitrationPriorityMedium indicates the medium priority
|
|
ArbitrationPriorityMedium
|
|
// ArbitrationPriorityHigh indicates the high priority
|
|
ArbitrationPriorityHigh
|
|
|
|
minArbitrationPriority = ArbitrationPriorityLow
|
|
maxArbitrationPriority = ArbitrationPriorityHigh + 1
|
|
maxArbitrateMode = maxArbitrationPriority + 1
|
|
|
|
// ArbitrationWaitAverse indicates the wait-averse property
|
|
ArbitrationWaitAverse = maxArbitrationPriority
|
|
)
|
|
|
|
var errArbitrateFailError = errors.New("failed to allocate resource from arbitrator")
|
|
|
|
var arbitrationPriorityNames = [maxArbitrationPriority]string{"LOW", "MEDIUM", "HIGH"}
|
|
|
|
// String returns the string representation of the ArbitrationPriority
|
|
func (p ArbitrationPriority) String() string {
|
|
return arbitrationPriorityNames[p]
|
|
}
|
|
|
|
var arbitratorWorkModeNames = []string{ArbitratorModeStandardName, ArbitratorModePriorityName, ArbitratorModeDisableName}
|
|
|
|
// String returns the string representation of the ArbitratorWorkMode
|
|
func (m ArbitratorWorkMode) String() string {
|
|
return arbitratorWorkModeNames[m]
|
|
}
|
|
|
|
func (m *MemArbitrator) taskNumByPriority(priority ArbitrationPriority) int64 {
|
|
return m.tasks.fifoByPriority[priority].approxSize()
|
|
}
|
|
|
|
func (m *MemArbitrator) taskNumOfWaitAverse() int64 {
|
|
return m.tasks.fifoWaitAverse.approxSize()
|
|
}
|
|
|
|
func (m *MemArbitrator) firstTaskEntry(priority ArbitrationPriority) *rootPoolEntry {
|
|
return m.tasks.fifoByPriority[priority].front()
|
|
}
|
|
|
|
func (m *MemArbitrator) removeTaskImpl(entry *rootPoolEntry) bool {
|
|
if entry.taskMu.fifo.valid() {
|
|
m.tasks.fifoTasks.remove(entry.taskMu.fifo)
|
|
entry.taskMu.fifo.reset()
|
|
m.tasks.fifoByPriority[entry.taskMu.fifoByPriority.priority].remove(entry.taskMu.fifoByPriority.wrapListElement)
|
|
entry.taskMu.fifoByPriority.reset()
|
|
if entry.taskMu.fifoWaitAverse.valid() {
|
|
m.tasks.fifoWaitAverse.remove(entry.taskMu.fifoWaitAverse)
|
|
entry.taskMu.fifoWaitAverse.reset()
|
|
}
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// there is no need to wind up if task has been removed by cancel.
|
|
func (m *MemArbitrator) removeTask(entry *rootPoolEntry) (res bool) {
|
|
m.tasks.Lock()
|
|
|
|
res = m.removeTaskImpl(entry)
|
|
|
|
m.tasks.Unlock()
|
|
return res
|
|
}
|
|
|
|
func (m *MemArbitrator) addTask(entry *rootPoolEntry) {
|
|
m.tasks.waitingAlloc.Add(entry.request.quota) // before tasks lock
|
|
{
|
|
m.tasks.Lock()
|
|
|
|
priority := entry.ctx.memPriority
|
|
entry.taskMu.fifoByPriority.priority = priority
|
|
entry.taskMu.fifoByPriority.wrapListElement = m.tasks.fifoByPriority[priority].pushBack(entry)
|
|
if entry.ctx.waitAverse {
|
|
entry.taskMu.fifoWaitAverse = m.tasks.fifoWaitAverse.pushBack(entry)
|
|
}
|
|
entry.taskMu.fifo = m.tasks.fifoTasks.pushBack(entry)
|
|
|
|
m.tasks.Unlock()
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) frontTaskEntry() (entry *rootPoolEntry) {
|
|
m.tasks.Lock()
|
|
|
|
entry = m.tasks.fifoTasks.front()
|
|
|
|
m.tasks.Unlock()
|
|
return
|
|
}
|
|
|
|
func (m *MemArbitrator) extractFirstTaskEntry() (entry *rootPoolEntry) {
|
|
m.tasks.Lock()
|
|
|
|
if m.privilegedEntry != nil {
|
|
if m.privilegedEntry.taskMu.fifo.valid() {
|
|
m.tasks.fifoTasks.moveToFront(m.privilegedEntry.taskMu.fifo)
|
|
entry = m.privilegedEntry
|
|
}
|
|
}
|
|
|
|
if entry == nil {
|
|
if m.execMu.mode == ArbitratorModePriority {
|
|
for priority := maxArbitrationPriority - 1; priority >= minArbitrationPriority; priority-- {
|
|
if entry = m.firstTaskEntry(priority); entry != nil {
|
|
break
|
|
}
|
|
}
|
|
} else {
|
|
entry = m.tasks.fifoTasks.front()
|
|
}
|
|
}
|
|
|
|
m.tasks.Unlock()
|
|
return
|
|
}
|
|
|
|
type rootPoolEntry struct {
|
|
pool *ResourcePool
|
|
taskMu struct { // protected by the tasks mutex of arbitrator
|
|
fifo wrapListElement
|
|
fifoWaitAverse wrapListElement
|
|
fifoByPriority struct {
|
|
wrapListElement
|
|
priority ArbitrationPriority
|
|
}
|
|
}
|
|
|
|
// context of execution
|
|
// mutable when entry is idle and the mutex of root pool is locked
|
|
ctx struct {
|
|
atomic.Pointer[ArbitrationContext]
|
|
cancelCh <-chan struct{}
|
|
|
|
// properties hint of the entry; data race is acceptable;
|
|
memPriority ArbitrationPriority
|
|
waitAverse bool
|
|
preferPrivilege bool
|
|
}
|
|
request struct {
|
|
resultCh chan ArbitrateResult // arbitrator will send result in `windup
|
|
quota int64 // mutable for ResourcePool
|
|
}
|
|
arbitratorMu struct { // mutable for arbitrator
|
|
shard *entryMapShard
|
|
quotaShard *entryQuotaShard
|
|
underKill entryKillCancelCtx
|
|
underCancel entryKillCancelCtx
|
|
quota int64 // -1: uninitiated
|
|
destroyed bool
|
|
}
|
|
stateMu struct {
|
|
quotaToReclaim atomic.Int64
|
|
sync.Mutex
|
|
stop atomic.Bool
|
|
|
|
// execStateIdle -> execStateRunning -> execStatePrivileged -> execStateIdle
|
|
// execStateIdle -> execStateRunning -> execStateIdle
|
|
exec entryExecState
|
|
}
|
|
}
|
|
|
|
type mapUIDEntry map[uint64]*rootPoolEntry
|
|
|
|
type entryMapShard struct {
|
|
entries mapUIDEntry
|
|
sync.RWMutex
|
|
}
|
|
|
|
type entryQuotaShard struct {
|
|
entries mapUIDEntry
|
|
}
|
|
|
|
func (e *rootPoolEntry) execState() entryExecState {
|
|
return entryExecState(atomic.LoadInt32((*int32)(&e.stateMu.exec)))
|
|
}
|
|
|
|
func (e *rootPoolEntry) setExecState(s entryExecState) {
|
|
atomic.StoreInt32((*int32)(&e.stateMu.exec), int32(s))
|
|
}
|
|
|
|
func (e *rootPoolEntry) intoExecPrivileged() bool {
|
|
return atomic.CompareAndSwapInt32((*int32)(&e.stateMu.exec), int32(execStateRunning), int32(execStatePrivileged))
|
|
}
|
|
|
|
func (e *rootPoolEntry) notRunning() bool {
|
|
return e.stateMu.stop.Load() || e.execState() == execStateIdle || e.stateMu.quotaToReclaim.Load() > 0
|
|
}
|
|
|
|
type entryMap struct {
|
|
quotaShards [maxArbitrationPriority][]*entryQuotaShard // entries order by priority, quota
|
|
contextCache struct { // cache for traversing all entries concurrently
|
|
sync.Map // map[uint64]*rootPoolEntry
|
|
num atomic.Int64
|
|
}
|
|
shards []*entryMapShard
|
|
shardsMask uint64
|
|
maxQuotaShardIndex int // for quota >= `BaseQuotaUnit * 2^(maxQuotaShard - 1)`
|
|
minQuotaShardIndexToCheck int // ignore the pool with smaller quota
|
|
}
|
|
|
|
// controlled by arbitrator
|
|
func (m *entryMap) delete(entry *rootPoolEntry) {
|
|
uid := entry.pool.uid
|
|
|
|
if entry.arbitratorMu.quotaShard != nil {
|
|
delete(entry.arbitratorMu.quotaShard.entries, uid)
|
|
entry.arbitratorMu.quota = 0
|
|
entry.arbitratorMu.quotaShard = nil
|
|
}
|
|
|
|
entry.arbitratorMu.shard.delete(uid)
|
|
entry.arbitratorMu.shard = nil
|
|
|
|
if _, loaded := m.contextCache.LoadAndDelete(uid); loaded {
|
|
m.contextCache.num.Add(-1)
|
|
}
|
|
}
|
|
|
|
// controlled by arbitrator
|
|
func (m *entryMap) addQuota(entry *rootPoolEntry, delta int64) {
|
|
if delta == 0 {
|
|
return
|
|
}
|
|
|
|
uid := entry.pool.UID()
|
|
|
|
entry.arbitratorMu.quota += delta
|
|
|
|
if entry.arbitratorMu.quota == 0 { // remove
|
|
delete(entry.arbitratorMu.quotaShard.entries, uid)
|
|
entry.arbitratorMu.quotaShard = nil
|
|
return
|
|
}
|
|
|
|
newPos := getQuotaShard(entry.arbitratorMu.quota, m.maxQuotaShardIndex)
|
|
newShard := m.quotaShards[entry.ctx.memPriority][newPos]
|
|
if newShard != entry.arbitratorMu.quotaShard {
|
|
if entry.arbitratorMu.quotaShard != nil {
|
|
delete(entry.arbitratorMu.quotaShard.entries, uid)
|
|
}
|
|
entry.arbitratorMu.quotaShard = newShard
|
|
entry.arbitratorMu.quotaShard.entries[uid] = entry
|
|
}
|
|
}
|
|
|
|
func (m *entryMap) getStatusShard(key uint64) *entryMapShard {
|
|
return m.shards[shardIndexByUID(key, m.shardsMask)]
|
|
}
|
|
|
|
func (m *entryMap) getQuotaShard(priority ArbitrationPriority, quota int64) *entryQuotaShard {
|
|
return m.quotaShards[priority][getQuotaShard(quota, m.maxQuotaShardIndex)]
|
|
}
|
|
|
|
func (s *entryMapShard) get(key uint64) (e *rootPoolEntry, ok bool) {
|
|
s.RLock()
|
|
e, ok = s.entries[key]
|
|
s.RUnlock()
|
|
return
|
|
}
|
|
|
|
func (s *entryMapShard) delete(key uint64) {
|
|
s.Lock()
|
|
|
|
delete(s.entries, key)
|
|
|
|
s.Unlock()
|
|
}
|
|
|
|
func (s *entryMapShard) emplace(key uint64, tar *rootPoolEntry) (e *rootPoolEntry, ok bool) {
|
|
s.Lock()
|
|
|
|
e, found := s.entries[key]
|
|
ok = !found
|
|
if !found {
|
|
s.entries[key] = tar
|
|
e = tar
|
|
}
|
|
|
|
s.Unlock()
|
|
return
|
|
}
|
|
|
|
func (m *entryMap) emplace(pool *ResourcePool) (*rootPoolEntry, bool) {
|
|
key := pool.UID()
|
|
s := m.getStatusShard(key)
|
|
if v, ok := s.get(key); ok {
|
|
return v, false
|
|
}
|
|
tar := &rootPoolEntry{pool: pool}
|
|
tar.arbitratorMu.shard = s
|
|
tar.request.resultCh = make(chan ArbitrateResult, 1)
|
|
|
|
return s.emplace(key, tar)
|
|
}
|
|
|
|
func (m *entryMap) init(shardNum uint64, maxQuotaShard int, minQuotaForReclaim int64) {
|
|
m.shards = make([]*entryMapShard, shardNum)
|
|
m.shardsMask = shardNum - 1
|
|
m.maxQuotaShardIndex = maxQuotaShard
|
|
m.minQuotaShardIndexToCheck = getQuotaShard(minQuotaForReclaim, m.maxQuotaShardIndex)
|
|
for p := minArbitrationPriority; p < maxArbitrationPriority; p++ {
|
|
m.quotaShards[p] = make([]*entryQuotaShard, m.maxQuotaShardIndex)
|
|
for i := range m.maxQuotaShardIndex {
|
|
m.quotaShards[p][i] = &entryQuotaShard{
|
|
entries: make(mapUIDEntry),
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := range shardNum {
|
|
m.shards[i] = &entryMapShard{
|
|
entries: make(mapUIDEntry),
|
|
}
|
|
}
|
|
}
|
|
|
|
// if entry is in task queue, it must have acquire the unique request lock and wait for callback
|
|
// this func only can be invoked after `removeTask`
|
|
func (e *rootPoolEntry) windUp(delta int64, r ArbitrateResult) {
|
|
e.pool.forceAddCap(delta)
|
|
e.request.resultCh <- r
|
|
}
|
|
|
|
// non thread safe: the mutex of root pool must have been locked
|
|
func (m *MemArbitrator) blockingAllocate(entry *rootPoolEntry, requestedBytes int64) ArbitrateResult {
|
|
if entry.execState() == execStateIdle {
|
|
return ArbitrateFail
|
|
}
|
|
|
|
m.prepareAlloc(entry, requestedBytes)
|
|
return m.waitAlloc(entry)
|
|
}
|
|
|
|
// non thread safe: the mutex of root pool must have been locked
|
|
func (m *MemArbitrator) prepareAlloc(entry *rootPoolEntry, requestedBytes int64) {
|
|
entry.request.quota = requestedBytes
|
|
m.addTask(entry)
|
|
m.notifer.WeakWake()
|
|
}
|
|
|
|
// non thread safe: the mutex of root pool must have been locked
|
|
func (m *MemArbitrator) waitAlloc(entry *rootPoolEntry) ArbitrateResult {
|
|
res := ArbitrateOk
|
|
select {
|
|
case res = <-entry.request.resultCh:
|
|
if res == ArbitrateFail {
|
|
atomic.AddInt64(&m.execMetrics.Task.Fail, 1)
|
|
} else {
|
|
atomic.AddInt64(&m.execMetrics.Task.Succ, 1)
|
|
}
|
|
case <-entry.ctx.cancelCh:
|
|
// 1. cancel by session
|
|
// 2. stop by the arbitrate-helper (cancel / kill by arbitrator)
|
|
res = ArbitrateFail
|
|
atomic.AddInt64(&m.execMetrics.Task.Fail, 1)
|
|
|
|
if !m.removeTask(entry) {
|
|
<-entry.request.resultCh
|
|
}
|
|
}
|
|
|
|
m.tasks.waitingAlloc.Add(-entry.request.quota)
|
|
entry.request.quota = 0
|
|
|
|
return res
|
|
}
|
|
|
|
type blockedState struct {
|
|
allocated int64
|
|
utimeSec int64
|
|
}
|
|
|
|
// PoolAllocProfile represents the profile of root pool allocation in the mem-arbitrator
|
|
type PoolAllocProfile struct {
|
|
SmallPoolLimit int64 // limit / 1000
|
|
PoolAllocUnit int64 // limit / 500
|
|
MaxPoolAllocUnit int64 // limit / 100
|
|
}
|
|
|
|
// MemArbitrator represents the main structure aka `mem-arbitrator`
|
|
type MemArbitrator struct {
|
|
execMu struct {
|
|
startTime time.Time // start time of each round
|
|
blockedState blockedState // blocked state during arbitration
|
|
mode ArbitratorWorkMode // work mode of each round
|
|
}
|
|
actions MemArbitratorActions // actions interfaces
|
|
controlMu struct { // control the async work process
|
|
finishCh chan struct{}
|
|
sync.Mutex
|
|
running atomic.Bool
|
|
}
|
|
debug struct{ now func() time.Time } // mock time.Now
|
|
privilegedEntry *rootPoolEntry // entry with privilege will always be satisfied first
|
|
underKill mapEntryWithMem // entries under `KILL` operation
|
|
underCancel mapEntryWithMem // entries under `CANCEL` operation
|
|
notifer Notifer // wake up the async work process
|
|
cleanupMu struct { // cleanup the state of the entry
|
|
fifoTasks wrapList[*rootPoolEntry]
|
|
sync.Mutex
|
|
}
|
|
tasks struct {
|
|
fifoByPriority [maxArbitrationPriority]wrapList[*rootPoolEntry] // tasks by priority
|
|
fifoTasks wrapList[*rootPoolEntry] // all tasks in FIFO order
|
|
fifoWaitAverse wrapList[*rootPoolEntry] // tasks with wait-averse property
|
|
waitingAlloc atomic.Int64 // total waiting allocation size
|
|
sync.Mutex
|
|
}
|
|
digestProfileCache struct {
|
|
shards []digestProfileShard
|
|
shardsMask uint64
|
|
num atomic.Int64
|
|
limit int64 // max number of digest profiles; shrink to limit/2 when num > limit;
|
|
}
|
|
entryMap entryMap // sharded hash map & ordered quota map
|
|
awaitFree struct { // await-free pool
|
|
pool *ResourcePool
|
|
budget struct { // fixed size budget shards
|
|
shards []TrackedConcurrentBudget
|
|
sizeMask uint64
|
|
}
|
|
lastQuotaUsage memPoolQuotaUsage // tracked heap memory usage & quota usage
|
|
lastShrinkUtimeMilli atomic.Int64
|
|
}
|
|
|
|
heapController heapController // monitor runtime mem stats; resolve mem issues; record mem profiles;
|
|
|
|
poolAllocStats struct { // statistics of root pool allocation
|
|
sync.RWMutex
|
|
PoolAllocProfile
|
|
mediumQuota atomic.Int64 // medium (max quota usage of root pool)
|
|
timedMap [2 + defRedundancy]struct {
|
|
sync.RWMutex
|
|
statisticsTimedMapElement
|
|
}
|
|
lastUpdateUtimeMilli atomic.Int64
|
|
}
|
|
|
|
buffer buffer // reserved buffer quota which only works under priority mode
|
|
|
|
mu struct {
|
|
sync.Mutex
|
|
_ cpuCacheLinePad
|
|
allocated int64 // allocated mem quota
|
|
released uint64 // total released mem quota
|
|
lastGC uint64 // total released mem quota at last GC point
|
|
_ cpuCacheLinePad
|
|
limit int64 // hard limit of mem quota which is same as the server limit
|
|
threshold struct {
|
|
risk int64 // threshold of mem risk
|
|
oomRisk int64 // threshold of oom risk
|
|
}
|
|
softLimit struct {
|
|
mode SoftLimitMode
|
|
size int64
|
|
specified struct {
|
|
size int64
|
|
ratio int64 // ratio of soft-limit to hard-limit
|
|
}
|
|
}
|
|
}
|
|
execMetrics execMetricsCounter // execution metrics
|
|
avoidance struct {
|
|
size atomic.Int64 // size of quota cannot be allocated
|
|
heapTracked struct { // tracked heap memory usage
|
|
atomic.Int64
|
|
lastUpdateUtimeMilli atomic.Int64
|
|
}
|
|
memMagnif struct { // memory pressure magnification factor: ratio of runtime memory usage to quota usage
|
|
sync.Mutex
|
|
ratio atomic.Int64
|
|
}
|
|
awaitFreeBudgetKickOutIdx uint64 // round-robin index to clean await-free pool budget when quota is insufficient
|
|
}
|
|
tickTask struct { // periodic task
|
|
sync.Mutex
|
|
lastTickUtimeMilli atomic.Int64
|
|
}
|
|
UnixTimeSec int64 // approximate unix time in seconds
|
|
rootPoolNum atomic.Int64
|
|
mode ArbitratorWorkMode
|
|
}
|
|
|
|
type buffer struct {
|
|
size atomic.Int64 // approximate max quota usage of root pool
|
|
quotaLimit atomic.Int64
|
|
timedMap [2 + defRedundancy]struct {
|
|
sync.RWMutex
|
|
wrapTimeSizeQuota
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) setBufferSize(v int64) {
|
|
m.buffer.size.Store(v)
|
|
}
|
|
|
|
func (m *MemArbitrator) setQuotaLimit(v int64) {
|
|
m.buffer.quotaLimit.Store(v)
|
|
}
|
|
|
|
type digestProfileShard struct {
|
|
sync.Map //map[uint64]*digestProfile
|
|
num atomic.Int64
|
|
}
|
|
|
|
// MemArbitratorActions represents the actions of the mem-arbitrator
|
|
type MemArbitratorActions struct {
|
|
Info, Warn, Error func(format string, args ...zap.Field) // log actions
|
|
|
|
UpdateRuntimeMemStats func() // update runtime memory statistics
|
|
GC func() // garbage collection
|
|
}
|
|
|
|
type awaitFreePoolExecMetrics struct {
|
|
pairSuccessFail
|
|
Shrink int64
|
|
ForceShrink int64
|
|
}
|
|
|
|
type pairSuccessFail struct{ Succ, Fail int64 }
|
|
|
|
// NumByPriority represents the number of tasks by priority
|
|
type NumByPriority [maxArbitrationPriority]int64
|
|
|
|
type execMetricsAction struct {
|
|
GC int64
|
|
UpdateRuntimeMemStats int64
|
|
RecordMemState pairSuccessFail
|
|
}
|
|
|
|
type execMetricsRisk struct {
|
|
Mem int64
|
|
OOM int64
|
|
OOMKill NumByPriority
|
|
}
|
|
|
|
type execMetricsCancel struct {
|
|
StandardMode int64
|
|
PriorityMode NumByPriority
|
|
WaitAverse int64
|
|
}
|
|
|
|
type execMetricsTask struct {
|
|
pairSuccessFail // all work modes
|
|
SuccByPriority NumByPriority // priority mode
|
|
}
|
|
|
|
type execMetricsCounter struct {
|
|
Task execMetricsTask
|
|
Cancel execMetricsCancel
|
|
AwaitFree awaitFreePoolExecMetrics
|
|
Action execMetricsAction
|
|
Risk execMetricsRisk
|
|
ShrinkDigest int64
|
|
}
|
|
|
|
// ExecMetrics returns the reference of the execution metrics
|
|
//
|
|
//go:norace
|
|
func (m *MemArbitrator) ExecMetrics() execMetricsCounter {
|
|
if m == nil {
|
|
return execMetricsCounter{}
|
|
}
|
|
return m.execMetrics
|
|
}
|
|
|
|
// SetWorkMode sets the work mode of the mem-arbitrator
|
|
func (m *MemArbitrator) SetWorkMode(newMode ArbitratorWorkMode) (oriMode ArbitratorWorkMode) {
|
|
oriMode = ArbitratorWorkMode(atomic.SwapInt32((*int32)(&m.mode), int32(newMode)))
|
|
m.wake()
|
|
return
|
|
}
|
|
|
|
// WorkMode returns the current work mode of the mem-arbitrator
|
|
func (m *MemArbitrator) WorkMode() ArbitratorWorkMode {
|
|
if m == nil {
|
|
return ArbitratorModeDisable
|
|
}
|
|
return m.workMode()
|
|
}
|
|
|
|
// PoolAllocProfile returns the profile of root pool allocation in the mem-arbitrator
|
|
func (m *MemArbitrator) PoolAllocProfile() (res PoolAllocProfile) {
|
|
limit := m.limit()
|
|
return PoolAllocProfile{
|
|
SmallPoolLimit: max(1, limit/defServerlimitSmallLimitNum),
|
|
PoolAllocUnit: max(1, limit/defServerlimitMinUnitNum),
|
|
MaxPoolAllocUnit: max(1, limit/defServerlimitMaxUnitNum),
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) workMode() ArbitratorWorkMode {
|
|
return ArbitratorWorkMode(atomic.LoadInt32((*int32)(&m.mode)))
|
|
}
|
|
|
|
// GetDigestProfileCache returns the digest profile cache for a given digest-id and utime
|
|
func (m *MemArbitrator) GetDigestProfileCache(digestID uint64, utimeSec int64) (int64, bool) {
|
|
d := &m.digestProfileCache.shards[digestID&m.digestProfileCache.shardsMask]
|
|
e, ok := d.Load(digestID)
|
|
if !ok {
|
|
return 0, false
|
|
}
|
|
|
|
pf := e.(*digestProfile)
|
|
|
|
if utimeSec > pf.lastFetchUtimeSec.Load() {
|
|
pf.lastFetchUtimeSec.Store(utimeSec)
|
|
}
|
|
|
|
return pf.maxVal.Load(), true
|
|
}
|
|
|
|
func (m *MemArbitrator) shrinkDigestProfile(utimeSec int64, limit, shrinkTo int64) (shrinkedNum int64) {
|
|
if m.digestProfileCache.num.Load() <= limit {
|
|
return
|
|
}
|
|
|
|
m.execMetrics.ShrinkDigest++
|
|
|
|
var valMap [defPoolQuotaShards]int
|
|
|
|
for i := range m.digestProfileCache.shards {
|
|
d := &m.digestProfileCache.shards[i]
|
|
if d.num.Load() == 0 {
|
|
continue
|
|
}
|
|
dn := int64(0)
|
|
d.Range(func(k, v any) bool {
|
|
pf := v.(*digestProfile)
|
|
maxVal := pf.maxVal.Load()
|
|
{ // try to delete timeout cache
|
|
needDelete := false
|
|
|
|
if maxVal > m.poolAllocStats.SmallPoolLimit {
|
|
if utimeSec-pf.lastFetchUtimeSec.Load() > defDigestProfileMemTimeoutSec {
|
|
needDelete = true
|
|
}
|
|
} else { // small max-val
|
|
if utimeSec-pf.lastFetchUtimeSec.Load() > defDigestProfileSmallMemTimeoutSec {
|
|
needDelete = true
|
|
}
|
|
}
|
|
|
|
if needDelete {
|
|
if _, loaded := d.LoadAndDelete(k); loaded {
|
|
d.num.Add(-1)
|
|
dn++
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
index := getQuotaShard(maxVal, defPoolQuotaShards)
|
|
valMap[index]++
|
|
return true
|
|
})
|
|
m.digestProfileCache.num.Add(-dn)
|
|
shrinkedNum += dn
|
|
}
|
|
|
|
toShinkNum := m.digestProfileCache.num.Load() - shrinkTo
|
|
if toShinkNum <= 0 {
|
|
return
|
|
}
|
|
|
|
shrinkMaxSize := DefMaxLimit
|
|
{ // find the max size to shrink
|
|
n := int64(0)
|
|
for i := range defPoolQuotaShards {
|
|
if n += int64(valMap[i]); n >= toShinkNum {
|
|
shrinkMaxSize = baseQuotaUnit * (1 << i)
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
for i := range m.digestProfileCache.shards {
|
|
d := &m.digestProfileCache.shards[i]
|
|
if d.num.Load() == 0 {
|
|
continue
|
|
}
|
|
dn := int64(0)
|
|
d.Range(func(k, v any) bool {
|
|
if pf := v.(*digestProfile); pf.maxVal.Load() < shrinkMaxSize {
|
|
if _, loaded := d.LoadAndDelete(k); loaded {
|
|
d.num.Add(-1)
|
|
toShinkNum--
|
|
dn++
|
|
}
|
|
}
|
|
|
|
return toShinkNum > 0
|
|
})
|
|
m.digestProfileCache.num.Add(-dn)
|
|
shrinkedNum += dn
|
|
|
|
if toShinkNum <= 0 {
|
|
break
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// UpdateDigestProfileCache updates the digest profile cache for a given digest-id
|
|
func (m *MemArbitrator) UpdateDigestProfileCache(digestID uint64, memConsumed int64, utimeSec int64) {
|
|
d := &m.digestProfileCache.shards[digestID&m.digestProfileCache.shardsMask]
|
|
var pf *digestProfile
|
|
if e, ok := d.Load(digestID); ok {
|
|
pf = e.(*digestProfile)
|
|
} else {
|
|
pf = &digestProfile{}
|
|
if actual, loaded := d.LoadOrStore(digestID, pf); loaded {
|
|
pf = actual.(*digestProfile)
|
|
} else {
|
|
d.num.Add(1)
|
|
m.digestProfileCache.num.Add(1)
|
|
}
|
|
}
|
|
|
|
const maxNum = int64(len(pf.timedMap))
|
|
const maxDur = maxNum - defRedundancy
|
|
|
|
tsAlign := utimeSec / defUpdateBufferTimeAlignSec
|
|
tar := &pf.timedMap[tsAlign%maxNum]
|
|
|
|
if oriTs := tar.tsAlign.Load(); oriTs < tsAlign && oriTs != 0 {
|
|
tar.Lock()
|
|
|
|
if oriTs = tar.tsAlign.Load(); oriTs < tsAlign && oriTs != 0 {
|
|
tar.wrapTimeMaxval = wrapTimeMaxval{}
|
|
}
|
|
|
|
tar.Unlock()
|
|
}
|
|
|
|
tar.RLock()
|
|
|
|
updateSize := false
|
|
cleanNext := false
|
|
|
|
if tar.tsAlign.Load() == 0 {
|
|
if tar.tsAlign.CompareAndSwap(0, tsAlign) {
|
|
cleanNext = true
|
|
}
|
|
}
|
|
|
|
for oldVal := tar.maxVal.Load(); oldVal < memConsumed; oldVal = tar.maxVal.Load() {
|
|
if tar.maxVal.CompareAndSwap(oldVal, memConsumed) {
|
|
updateSize = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if updateSize {
|
|
maxv := tar.maxVal.Load()
|
|
// tsAlign-1, tsAlign
|
|
for i := range maxDur {
|
|
d := &pf.timedMap[(maxNum+tsAlign-i)%maxNum]
|
|
|
|
if ts := d.tsAlign.Load(); ts > tsAlign-maxDur && ts <= tsAlign {
|
|
maxv = max(maxv, d.maxVal.Load())
|
|
}
|
|
}
|
|
pf.maxVal.CompareAndSwap(pf.maxVal.Load(), maxv) // force update
|
|
}
|
|
|
|
tar.RUnlock()
|
|
|
|
if utimeSec > pf.lastFetchUtimeSec.Load() {
|
|
pf.lastFetchUtimeSec.Store(utimeSec)
|
|
}
|
|
|
|
if cleanNext {
|
|
d := &pf.timedMap[(tsAlign+1)%maxNum]
|
|
d.Lock()
|
|
|
|
if ts := d.tsAlign.Load(); ts < (tsAlign+1) && ts != 0 {
|
|
d.wrapTimeMaxval = wrapTimeMaxval{}
|
|
}
|
|
|
|
d.Unlock()
|
|
}
|
|
}
|
|
|
|
type digestProfile struct {
|
|
maxVal atomic.Int64
|
|
timedMap [2 + defRedundancy]struct {
|
|
sync.RWMutex
|
|
wrapTimeMaxval
|
|
}
|
|
lastFetchUtimeSec atomic.Int64
|
|
}
|
|
|
|
type wrapTimeMaxval struct {
|
|
tsAlign atomic.Int64
|
|
maxVal atomic.Int64
|
|
}
|
|
|
|
type wrapTimeSizeQuota struct {
|
|
ts atomic.Int64
|
|
size atomic.Int64
|
|
quota atomic.Int64
|
|
}
|
|
|
|
type statisticsTimedMapElement struct {
|
|
tsAlign atomic.Int64
|
|
slot [defServerlimitMinUnitNum]uint32
|
|
num atomic.Uint64
|
|
}
|
|
|
|
type entryKillCancelCtx struct {
|
|
startTime time.Time
|
|
reclaim int64
|
|
start bool
|
|
fail bool
|
|
}
|
|
|
|
type mapEntryWithMem struct {
|
|
entries mapUIDEntry
|
|
num int64
|
|
}
|
|
|
|
func (x *mapEntryWithMem) delete(entry *rootPoolEntry) {
|
|
delete(x.entries, entry.pool.uid)
|
|
x.num--
|
|
}
|
|
|
|
//go:norace
|
|
func (x *mapEntryWithMem) approxSize() int64 {
|
|
return x.num
|
|
}
|
|
|
|
func (x *mapEntryWithMem) init() {
|
|
x.entries = make(mapUIDEntry)
|
|
}
|
|
|
|
func (x *mapEntryWithMem) add(entry *rootPoolEntry) {
|
|
x.entries[entry.pool.uid] = entry
|
|
x.num++
|
|
}
|
|
|
|
func (m *MemArbitrator) addUnderKill(entry *rootPoolEntry, memoryUsed int64, startTime time.Time) {
|
|
if !entry.arbitratorMu.underKill.start {
|
|
m.underKill.add(entry)
|
|
entry.arbitratorMu.underKill = entryKillCancelCtx{
|
|
start: true,
|
|
startTime: startTime,
|
|
reclaim: memoryUsed,
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) addUnderCancel(entry *rootPoolEntry, memoryUsed int64, startTime time.Time) {
|
|
if !entry.arbitratorMu.underCancel.start {
|
|
m.underCancel.add(entry)
|
|
entry.arbitratorMu.underCancel = entryKillCancelCtx{
|
|
start: true,
|
|
startTime: startTime,
|
|
reclaim: memoryUsed,
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) deleteUnderKill(entry *rootPoolEntry) {
|
|
if entry.arbitratorMu.underKill.start {
|
|
m.underKill.delete(entry)
|
|
entry.arbitratorMu.underKill.start = false
|
|
|
|
m.warnKillCancel(entry, &entry.arbitratorMu.underKill, "Finish to `KILL` root pool")
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) deleteUnderCancel(entry *rootPoolEntry) {
|
|
if entry.arbitratorMu.underCancel.start {
|
|
m.underCancel.delete(entry)
|
|
entry.arbitratorMu.underCancel.start = false
|
|
}
|
|
}
|
|
|
|
type memProfile struct {
|
|
startUtimeMilli int64
|
|
tsAlign int64
|
|
heap int64 // max heap-alloc size after GC
|
|
quota int64 // max quota allocated when failed to arbitrate
|
|
ratio int64 // heap / quota
|
|
}
|
|
|
|
type heapController struct {
|
|
memStateRecorder struct {
|
|
RecordMemState
|
|
lastMemState atomic.Pointer[RuntimeMemStateV1]
|
|
lastRecordUtimeMilli atomic.Int64
|
|
sync.Mutex
|
|
}
|
|
memRisk struct {
|
|
startTime struct {
|
|
t time.Time
|
|
nano atomic.Int64
|
|
}
|
|
lastMemStats struct {
|
|
startTime time.Time
|
|
heapTotalFree int64
|
|
}
|
|
minHeapFreeBPS int64
|
|
oomRisk bool
|
|
}
|
|
timedMemProfile [2]memProfile
|
|
lastGC struct {
|
|
heapAlloc atomic.Int64 // heap alloc size after GC
|
|
utime atomic.Int64 // end time of last GC
|
|
}
|
|
heapTotalFree atomic.Int64
|
|
heapAlloc atomic.Int64 // heap-alloc <= heap-inuse
|
|
heapInuse atomic.Int64
|
|
memOffHeap atomic.Int64 // off-heap memory: `stack` + `gc` + `other` + `meta` ...
|
|
memInuse atomic.Int64 // heap-inuse + off-heap: must be less than runtime-limit to avoid Heavy GC / OOM
|
|
sync.Mutex
|
|
}
|
|
|
|
func (m *MemArbitrator) lastMemState() (res *RuntimeMemStateV1) {
|
|
res = m.heapController.memStateRecorder.lastMemState.Load()
|
|
return
|
|
}
|
|
|
|
// RecordMemState is an interface for recording runtime memory state
|
|
type RecordMemState interface {
|
|
Load() (*RuntimeMemStateV1, error)
|
|
Store(*RuntimeMemStateV1) error
|
|
}
|
|
|
|
func (m *MemArbitrator) recordMemConsumed(memConsumed, utimeSec int64) {
|
|
m.poolAllocStats.RLock()
|
|
defer m.poolAllocStats.RUnlock()
|
|
|
|
const maxNum = int64(len(m.poolAllocStats.timedMap))
|
|
|
|
tsAlign := utimeSec / defUpdateMemConsumedTimeAlignSec
|
|
tar := &m.poolAllocStats.timedMap[tsAlign%maxNum]
|
|
|
|
if oriTs := tar.tsAlign.Load(); oriTs < tsAlign && oriTs != 0 {
|
|
tar.Lock()
|
|
|
|
if oriTs = tar.tsAlign.Load(); oriTs < tsAlign && oriTs != 0 {
|
|
tar.statisticsTimedMapElement = statisticsTimedMapElement{}
|
|
}
|
|
|
|
tar.Unlock()
|
|
}
|
|
|
|
cleanNext := false
|
|
{
|
|
tar.RLock()
|
|
|
|
if tar.tsAlign.Load() == 0 {
|
|
if tar.tsAlign.CompareAndSwap(0, tsAlign) {
|
|
cleanNext = true
|
|
}
|
|
}
|
|
|
|
{
|
|
pos := min(memConsumed/m.poolAllocStats.PoolAllocUnit, defServerlimitMinUnitNum-1)
|
|
atomic.AddUint32(&tar.slot[pos], 1)
|
|
tar.num.Add(1)
|
|
}
|
|
|
|
tar.RUnlock()
|
|
}
|
|
|
|
if cleanNext {
|
|
d := &m.poolAllocStats.timedMap[(tsAlign+1)%maxNum]
|
|
d.Lock()
|
|
|
|
if v := d.tsAlign.Load(); v < (tsAlign+1) && v != 0 {
|
|
d.statisticsTimedMapElement = statisticsTimedMapElement{}
|
|
}
|
|
|
|
d.Unlock()
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) tryToUpdateBuffer(memConsumed, memQuotaLimit, utimeSec int64) {
|
|
const maxNum = int64(len(m.buffer.timedMap))
|
|
const maxDur = maxNum - defRedundancy
|
|
|
|
tsAlign := utimeSec / defUpdateBufferTimeAlignSec
|
|
tar := &m.buffer.timedMap[tsAlign%maxNum]
|
|
|
|
if oriTs := tar.ts.Load(); oriTs < tsAlign && oriTs != 0 {
|
|
tar.Lock()
|
|
|
|
if oriTs = tar.ts.Load(); oriTs < tsAlign && oriTs != 0 {
|
|
tar.wrapTimeSizeQuota = wrapTimeSizeQuota{}
|
|
}
|
|
|
|
tar.Unlock()
|
|
}
|
|
|
|
tar.RLock()
|
|
|
|
updateSize := false
|
|
updateQuota := false
|
|
cleanNext := false
|
|
|
|
if ts := tar.ts.Load(); ts == 0 {
|
|
if tar.ts.CompareAndSwap(0, tsAlign) {
|
|
cleanNext = true
|
|
}
|
|
}
|
|
|
|
for oldVal := tar.size.Load(); oldVal < memConsumed; oldVal = tar.size.Load() {
|
|
if tar.size.CompareAndSwap(oldVal, memConsumed) {
|
|
updateSize = true
|
|
break
|
|
}
|
|
}
|
|
|
|
for oldVal := tar.quota.Load(); oldVal < memQuotaLimit; oldVal = tar.quota.Load() {
|
|
if tar.quota.CompareAndSwap(oldVal, memQuotaLimit) {
|
|
updateQuota = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if updateSize || updateQuota {
|
|
// tsAlign-1, tsAlign
|
|
for i := range maxDur {
|
|
d := &m.buffer.timedMap[(maxNum+tsAlign-i)%maxNum]
|
|
|
|
if ts := d.ts.Load(); ts > tsAlign-maxDur && ts <= tsAlign {
|
|
memConsumed = max(memConsumed, d.size.Load())
|
|
memQuotaLimit = max(memQuotaLimit, d.quota.Load())
|
|
}
|
|
}
|
|
if updateSize && m.buffer.size.Load() != memConsumed {
|
|
m.setBufferSize(memConsumed)
|
|
}
|
|
if updateQuota && m.buffer.quotaLimit.Load() != memQuotaLimit {
|
|
m.setQuotaLimit(memQuotaLimit)
|
|
}
|
|
}
|
|
|
|
tar.RUnlock()
|
|
|
|
if cleanNext {
|
|
d := &m.buffer.timedMap[(tsAlign+1)%maxNum]
|
|
d.Lock()
|
|
|
|
if v := d.ts.Load(); v < tsAlign+1 && v != 0 {
|
|
d.wrapTimeSizeQuota = wrapTimeSizeQuota{}
|
|
}
|
|
|
|
d.Unlock()
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) gc() {
|
|
m.mu.lastGC = m.mu.released
|
|
if m.actions.GC != nil {
|
|
m.actions.GC()
|
|
}
|
|
atomic.AddInt64(&m.execMetrics.Action.GC, 1)
|
|
}
|
|
|
|
func (m *MemArbitrator) reclaimHeap() {
|
|
m.gc()
|
|
m.refreshRuntimeMemStats() // refresh runtime mem stats after GC and record
|
|
}
|
|
|
|
func (m *MemArbitrator) setMinHeapFreeBPS(sz int64) {
|
|
m.heapController.memRisk.minHeapFreeBPS = sz
|
|
}
|
|
|
|
func (m *MemArbitrator) minHeapFreeBPS() int64 {
|
|
return m.heapController.memRisk.minHeapFreeBPS
|
|
}
|
|
|
|
// ResetRootPoolByID resets the root pool by ID and analyze the memory consumption info
|
|
func (m *MemArbitrator) ResetRootPoolByID(uid uint64, maxMemConsumed int64, tune bool) {
|
|
entry := m.getRootPoolEntry(uid)
|
|
if entry == nil {
|
|
return
|
|
}
|
|
|
|
if tune {
|
|
memQuotaLimit := int64(0)
|
|
if ctx := entry.ctx.Load(); ctx != nil {
|
|
memQuotaLimit = ctx.memQuotaLimit
|
|
}
|
|
|
|
m.tryToUpdateBuffer(
|
|
maxMemConsumed,
|
|
memQuotaLimit,
|
|
m.approxUnixTimeSec())
|
|
|
|
if maxMemConsumed > m.poolAllocStats.SmallPoolLimit {
|
|
m.recordMemConsumed(
|
|
maxMemConsumed,
|
|
m.approxUnixTimeSec())
|
|
}
|
|
}
|
|
|
|
m.resetRootPoolEntry(entry)
|
|
m.wake()
|
|
}
|
|
|
|
func (m *MemArbitrator) resetRootPoolEntry(entry *rootPoolEntry) bool {
|
|
{
|
|
entry.stateMu.Lock()
|
|
|
|
if entry.execState() == execStateIdle {
|
|
entry.stateMu.Unlock()
|
|
return false
|
|
}
|
|
entry.setExecState(execStateIdle)
|
|
|
|
entry.stateMu.Unlock()
|
|
}
|
|
|
|
// aquiure the lock of root pool:
|
|
// - wait for the alloc task to finish
|
|
// - publish the state of entry
|
|
if releasedSize := entry.pool.Stop(); releasedSize > 0 {
|
|
entry.stateMu.quotaToReclaim.Add(releasedSize)
|
|
}
|
|
|
|
{
|
|
m.cleanupMu.Lock()
|
|
|
|
m.cleanupMu.fifoTasks.pushBack(entry)
|
|
|
|
m.cleanupMu.Unlock()
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func (m *MemArbitrator) warnKillCancel(entry *rootPoolEntry, ctx *entryKillCancelCtx, reason string) {
|
|
m.actions.Warn(
|
|
reason,
|
|
zap.Uint64("uid", entry.pool.uid),
|
|
zap.String("name", entry.pool.name),
|
|
zap.String("mem-priority", entry.ctx.memPriority.String()),
|
|
zap.Int64("reclaimed", ctx.reclaim),
|
|
zap.Time("start-time", ctx.startTime),
|
|
)
|
|
}
|
|
|
|
// RemoveRootPoolByID removes & terminates the root pool by ID
|
|
func (m *MemArbitrator) RemoveRootPoolByID(uid uint64) bool {
|
|
entry := m.getRootPoolEntry(uid)
|
|
if entry == nil {
|
|
return false
|
|
}
|
|
|
|
if m.removeRootPoolEntry(entry) {
|
|
if ctx := entry.ctx.Load(); ctx != nil && ctx.arbitrateHelper != nil {
|
|
ctx.arbitrateHelper.Finish()
|
|
}
|
|
m.wake()
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (m *MemArbitrator) removeRootPoolEntry(entry *rootPoolEntry) bool {
|
|
{
|
|
entry.stateMu.Lock()
|
|
|
|
if entry.stateMu.stop.Swap(true) {
|
|
entry.stateMu.Unlock()
|
|
return false
|
|
}
|
|
|
|
if entry.execState() != execStateIdle {
|
|
entry.setExecState(execStateIdle)
|
|
}
|
|
|
|
entry.stateMu.Unlock()
|
|
}
|
|
|
|
// make the alloc task failed in arbitrator;
|
|
{
|
|
m.cleanupMu.Lock()
|
|
|
|
m.cleanupMu.fifoTasks.pushBack(entry)
|
|
|
|
m.cleanupMu.Unlock()
|
|
}
|
|
// aquiure the lock of root pool and clean up
|
|
entry.pool.Stop()
|
|
// any new lock of root pool must have sensed the exec state is idle
|
|
|
|
return true
|
|
}
|
|
|
|
func (m *MemArbitrator) getRootPoolEntry(uid uint64) *rootPoolEntry {
|
|
if e, ok := m.entryMap.getStatusShard(uid).get(uid); ok {
|
|
return e
|
|
}
|
|
return nil
|
|
}
|
|
|
|
type rootPool struct {
|
|
entry *rootPoolEntry
|
|
arbitrator *MemArbitrator
|
|
}
|
|
|
|
// FindRootPool finds the root pool by ID
|
|
func (m *MemArbitrator) FindRootPool(uid uint64) rootPool {
|
|
if e := m.getRootPoolEntry(uid); e != nil {
|
|
return rootPool{e, m}
|
|
}
|
|
return rootPool{}
|
|
}
|
|
|
|
// EmplaceRootPool emplaces a new root pool with the given uid (uid < 0 means the internal pool)
|
|
func (m *MemArbitrator) EmplaceRootPool(uid uint64) (rootPool, error) {
|
|
if e := m.getRootPoolEntry(uid); e != nil {
|
|
return rootPool{e, m}, nil
|
|
}
|
|
|
|
pool := &ResourcePool{
|
|
name: fmt.Sprintf("root-%d", uid),
|
|
uid: uid,
|
|
limit: DefMaxLimit,
|
|
allocAlignSize: 1,
|
|
}
|
|
entry, err := m.addRootPool(pool)
|
|
return rootPool{entry, m}, err
|
|
}
|
|
|
|
func (m *MemArbitrator) addRootPool(pool *ResourcePool) (*rootPoolEntry, error) {
|
|
if b := pool.capacity(); b != 0 {
|
|
return nil, fmt.Errorf("%s: has %d bytes budget left", pool.name, b)
|
|
}
|
|
if pool.mu.budget.pool != nil {
|
|
return nil, fmt.Errorf("%s: already started with pool %s", pool.name, pool.mu.budget.pool.Name())
|
|
}
|
|
if pool.reserved != 0 {
|
|
return nil, fmt.Errorf("%s: has %d reserved budget left", pool.name, pool.reserved)
|
|
}
|
|
|
|
entry, ok := m.entryMap.emplace(pool)
|
|
|
|
if !ok {
|
|
return nil, fmt.Errorf("%s: already exists", pool.name)
|
|
}
|
|
|
|
m.rootPoolNum.Add(1)
|
|
return entry, nil
|
|
}
|
|
|
|
func (m *MemArbitrator) doAdjustSoftLimit() {
|
|
var softLimit int64
|
|
limit := m.limit()
|
|
if m.mu.softLimit.mode == SoftLimitModeSpecified {
|
|
if m.mu.softLimit.specified.size > 0 {
|
|
softLimit = min(m.mu.softLimit.specified.size, limit)
|
|
} else {
|
|
softLimit = min(multiRatio(limit, m.mu.softLimit.specified.ratio), limit)
|
|
}
|
|
} else {
|
|
softLimit = m.oomRisk()
|
|
}
|
|
m.mu.softLimit.size = softLimit
|
|
}
|
|
|
|
// SetSoftLimit sets the soft limit of the mem-arbitrator
|
|
func (m *MemArbitrator) SetSoftLimit(softLimit int64, sortLimitRatio float64, mode SoftLimitMode) {
|
|
m.mu.Lock()
|
|
|
|
m.mu.softLimit.mode = mode
|
|
if mode == SoftLimitModeSpecified {
|
|
m.mu.softLimit.specified.size = softLimit
|
|
m.mu.softLimit.specified.ratio = intoRatio(sortLimitRatio)
|
|
}
|
|
m.doAdjustSoftLimit()
|
|
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) softLimit() int64 {
|
|
return m.mu.softLimit.size
|
|
}
|
|
|
|
// SoftLimit returns the soft limit of the mem-arbitrator
|
|
func (m *MemArbitrator) SoftLimit() uint64 {
|
|
if m == nil {
|
|
return 0
|
|
}
|
|
return uint64(m.softLimit())
|
|
}
|
|
|
|
func (m *MemArbitrator) doSetLimit(limit int64) {
|
|
m.mu.limit = limit
|
|
m.mu.threshold.oomRisk = int64(float64(limit) * defOOMRiskRatio)
|
|
m.mu.threshold.risk = int64(float64(limit) * defMemRiskRatio)
|
|
m.doAdjustSoftLimit()
|
|
}
|
|
|
|
// SetLimit sets the limit of the mem-arbitrator and returns whether the limit has changed
|
|
func (m *MemArbitrator) SetLimit(x uint64) (changed bool) {
|
|
newLimit := min(int64(x), DefMaxLimit)
|
|
if newLimit <= 0 {
|
|
return
|
|
}
|
|
|
|
needWake := false
|
|
{
|
|
m.mu.Lock()
|
|
|
|
if limit := m.limit(); newLimit != limit {
|
|
changed = true
|
|
needWake = newLimit > limit // update to a greater limit
|
|
m.doSetLimit(newLimit)
|
|
}
|
|
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
if changed {
|
|
m.resetStatistics()
|
|
}
|
|
|
|
if needWake {
|
|
m.weakWake()
|
|
}
|
|
return
|
|
}
|
|
|
|
func (m *MemArbitrator) resetStatistics() {
|
|
m.poolAllocStats.Lock()
|
|
|
|
m.poolAllocStats.PoolAllocProfile = m.PoolAllocProfile()
|
|
for i := range m.poolAllocStats.timedMap {
|
|
m.poolAllocStats.timedMap[i].statisticsTimedMapElement = statisticsTimedMapElement{}
|
|
}
|
|
|
|
m.poolAllocStats.Unlock()
|
|
}
|
|
|
|
func (m *MemArbitrator) alloc(x int64) {
|
|
m.mu.Lock()
|
|
|
|
m.doAlloc(x)
|
|
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
func (m *MemArbitrator) doAlloc(x int64) {
|
|
m.mu.allocated += x
|
|
}
|
|
|
|
func (m *MemArbitrator) release(x int64) {
|
|
if x <= 0 {
|
|
return
|
|
}
|
|
m.alloc(-x)
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) allocated() int64 {
|
|
return m.mu.allocated
|
|
}
|
|
|
|
func (m *MemArbitrator) lastBlockedAt() (allocated, utimeSec int64) {
|
|
return m.execMu.blockedState.allocated, m.execMu.blockedState.utimeSec
|
|
}
|
|
|
|
//go:norace
|
|
func (b *blockedState) reset() {
|
|
*b = blockedState{}
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) updateBlockedAt() {
|
|
m.execMu.blockedState = blockedState{m.allocated(), m.approxUnixTimeSec()}
|
|
}
|
|
|
|
// Allocated returns the allocated mem quota of the mem-arbitrator
|
|
func (m *MemArbitrator) Allocated() int64 {
|
|
return m.allocated()
|
|
}
|
|
|
|
// OutOfControl returns the size of the out-of-control mem
|
|
func (m *MemArbitrator) OutOfControl() int64 {
|
|
return m.avoidance.size.Load()
|
|
}
|
|
|
|
// WaitingAllocSize returns the pending alloc mem quota of the mem-arbitrator
|
|
func (m *MemArbitrator) WaitingAllocSize() int64 {
|
|
return m.tasks.waitingAlloc.Load()
|
|
}
|
|
|
|
// TaskNum returns the number of pending tasks in the mem-arbitrator
|
|
func (m *MemArbitrator) TaskNum() int64 {
|
|
return m.tasks.fifoTasks.approxSize()
|
|
}
|
|
|
|
// RootPoolNum returns the number of root pools in the mem-arbitrator
|
|
func (m *MemArbitrator) RootPoolNum() int64 {
|
|
return m.rootPoolNum.Load()
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) limit() int64 {
|
|
return m.mu.limit
|
|
}
|
|
|
|
// Limit returns the mem quota limit of the mem-arbitrator
|
|
func (m *MemArbitrator) Limit() uint64 {
|
|
if m == nil {
|
|
return 0
|
|
}
|
|
return uint64(m.limit())
|
|
}
|
|
|
|
func (m *MemArbitrator) allocateFromArbitrator(remainBytes int64, leastLeft int64) (bool, int64) {
|
|
reclaimedBytes := int64(0)
|
|
ok := false
|
|
{
|
|
m.mu.Lock()
|
|
|
|
if m.allocated() <= m.limit()-leastLeft-remainBytes {
|
|
m.doAlloc(remainBytes)
|
|
reclaimedBytes += remainBytes
|
|
ok = true
|
|
} else if rest := m.limit() - leastLeft - m.allocated(); rest > 0 {
|
|
m.doAlloc(rest)
|
|
reclaimedBytes += rest
|
|
}
|
|
|
|
m.mu.Unlock()
|
|
}
|
|
|
|
return ok, reclaimedBytes
|
|
}
|
|
|
|
func (m *MemArbitrator) doReclaimMemByPriority(target *rootPoolEntry, remainBytes int64) {
|
|
underReclaimBytes := int64(0)
|
|
|
|
// check under canceling pool entries
|
|
if m.underCancel.num > 0 {
|
|
now := m.innerTime()
|
|
for uid, entry := range m.underCancel.entries {
|
|
ctx := &entry.arbitratorMu.underCancel
|
|
if ctx.fail {
|
|
continue
|
|
}
|
|
if deadline := ctx.startTime.Add(defKillCancelCheckTimeout); now.Compare(deadline) >= 0 {
|
|
m.actions.Warn("Failed to `CANCEL` root pool due to timeout",
|
|
zap.Uint64("uid", uid),
|
|
zap.String("name", entry.pool.name),
|
|
zap.Int64("quota-to-reclaim", ctx.reclaim),
|
|
zap.String("mem-priority", entry.ctx.memPriority.String()),
|
|
zap.Time("start-time", ctx.startTime),
|
|
zap.Time("deadline", deadline),
|
|
)
|
|
ctx.fail = true
|
|
continue
|
|
}
|
|
underReclaimBytes += ctx.reclaim
|
|
}
|
|
}
|
|
|
|
// remain-bytes <= 0
|
|
if underReclaimBytes >= remainBytes {
|
|
return
|
|
}
|
|
|
|
// task whose mode is wait_averse must have been cleaned
|
|
|
|
for prio := minArbitrationPriority; prio < target.ctx.memPriority; prio++ {
|
|
for pos := m.entryMap.maxQuotaShardIndex - 1; pos >= m.entryMap.minQuotaShardIndexToCheck; pos-- {
|
|
for _, entry := range m.entryMap.quotaShards[prio][pos].entries {
|
|
if entry.arbitratorMu.underCancel.start || entry.notRunning() {
|
|
continue
|
|
}
|
|
if ctx := entry.ctx.Load(); ctx.available() {
|
|
m.execMetrics.Cancel.PriorityMode[prio]++
|
|
ctx.stop(ArbitratorPriorityCancel)
|
|
|
|
if m.removeTask(entry) {
|
|
entry.windUp(0, ArbitrateFail)
|
|
}
|
|
m.addUnderCancel(entry, entry.arbitratorMu.quota, m.innerTime())
|
|
underReclaimBytes += entry.arbitratorMu.quota
|
|
if underReclaimBytes >= remainBytes {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) allocateFromPrivilegedBudget(target *rootPoolEntry, remainBytes int64) (bool, int64) {
|
|
ok := false
|
|
if m.privilegedEntry == target {
|
|
ok = true
|
|
} else if m.privilegedEntry == nil && target.ctx.preferPrivilege {
|
|
if target.intoExecPrivileged() {
|
|
m.privilegedEntry = target
|
|
ok = true
|
|
} else {
|
|
ok = false
|
|
}
|
|
}
|
|
|
|
if !ok {
|
|
return false, 0
|
|
}
|
|
|
|
m.alloc(remainBytes)
|
|
|
|
return ok, remainBytes
|
|
}
|
|
|
|
func (m *MemArbitrator) ableToGC() bool {
|
|
return m.mu.released-m.mu.lastGC >= uint64(m.poolAllocStats.SmallPoolLimit)
|
|
}
|
|
|
|
func (m *MemArbitrator) tryRuntimeGC() bool {
|
|
if m.ableToGC() {
|
|
m.updateTrackedHeapStats()
|
|
m.reclaimHeap()
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// reserved buffer for arbitrate process
|
|
func (m *MemArbitrator) reservedBuffer() int64 {
|
|
if m.execMu.mode == ArbitratorModePriority {
|
|
return m.buffer.size.Load()
|
|
}
|
|
return 0
|
|
}
|
|
|
|
func (m *MemArbitrator) arbitrate(target *rootPoolEntry) (bool, int64) {
|
|
reclaimedBytes := int64(0)
|
|
remainBytes := target.request.quota
|
|
|
|
onlyPrivilegedBudget := false
|
|
for m.heapController.heapAlloc.Load() > m.limit()-m.reservedBuffer()-remainBytes {
|
|
if !m.tryRuntimeGC() {
|
|
onlyPrivilegedBudget = true // only could alloc from the privileged budget
|
|
break
|
|
}
|
|
}
|
|
|
|
{
|
|
ok := false
|
|
reclaimed := int64(0)
|
|
if m.execMu.mode == ArbitratorModePriority {
|
|
ok, reclaimed = m.allocateFromPrivilegedBudget(target, remainBytes)
|
|
reclaimedBytes += reclaimed
|
|
remainBytes -= reclaimed
|
|
}
|
|
if ok {
|
|
return true, reclaimedBytes
|
|
} else if onlyPrivilegedBudget {
|
|
return false, reclaimedBytes
|
|
}
|
|
}
|
|
|
|
for {
|
|
ok, reclaimed := m.allocateFromArbitrator(remainBytes, m.reservedBuffer()+m.avoidance.size.Load())
|
|
reclaimedBytes += reclaimed
|
|
remainBytes -= reclaimed
|
|
if ok {
|
|
return true, reclaimedBytes
|
|
}
|
|
if !m.tryRuntimeGC() {
|
|
break
|
|
}
|
|
}
|
|
|
|
return false, reclaimedBytes
|
|
}
|
|
|
|
// NewMemArbitrator creates a new mem-arbitrator heap instance
|
|
func NewMemArbitrator(limit int64, shardNum uint64, maxQuotaShardNum int, minQuotaForReclaim int64, recorder RecordMemState) *MemArbitrator {
|
|
if limit <= 0 {
|
|
limit = DefMaxLimit
|
|
}
|
|
m := &MemArbitrator{
|
|
mode: ArbitratorModeDisable,
|
|
}
|
|
m.tasks.fifoTasks.init()
|
|
for i := range m.tasks.fifoByPriority {
|
|
m.tasks.fifoByPriority[i].init()
|
|
}
|
|
shardNum = nextPow2(shardNum)
|
|
m.tasks.fifoWaitAverse.init()
|
|
m.notifer = NewNotifer()
|
|
m.entryMap.init(shardNum, maxQuotaShardNum, minQuotaForReclaim)
|
|
m.doSetLimit(limit)
|
|
m.resetStatistics()
|
|
m.setMinHeapFreeBPS(defMinHeapFreeBPS)
|
|
m.cleanupMu.fifoTasks.init()
|
|
m.underKill.init()
|
|
m.underCancel.init()
|
|
{
|
|
f := func(string, ...zap.Field) {}
|
|
m.actions.Info = f
|
|
m.actions.Warn = f
|
|
m.actions.Error = f
|
|
}
|
|
{
|
|
m.heapController.memStateRecorder.Lock()
|
|
|
|
m.heapController.memStateRecorder.RecordMemState = recorder
|
|
if s, err := recorder.Load(); err == nil && s != nil {
|
|
m.heapController.memStateRecorder.lastMemState.Store(s)
|
|
m.doSetMemMagnif(s.Magnif)
|
|
m.poolAllocStats.mediumQuota.Store(s.PoolMediumCap)
|
|
}
|
|
|
|
m.heapController.memStateRecorder.Unlock()
|
|
}
|
|
m.resetDigestProfileCache(shardNum)
|
|
return m
|
|
}
|
|
|
|
func (m *MemArbitrator) resetDigestProfileCache(shardNum uint64) {
|
|
m.digestProfileCache.shards = make([]digestProfileShard, shardNum)
|
|
m.digestProfileCache.shardsMask = shardNum - 1
|
|
m.digestProfileCache.num.Store(0)
|
|
m.digestProfileCache.limit = defMaxDigestProfileCacheLimit
|
|
}
|
|
|
|
// SetDigestProfileCacheLimit sets the limit of the digest profile cache
|
|
func (m *MemArbitrator) SetDigestProfileCacheLimit(limit int64) {
|
|
m.digestProfileCache.limit = min(max(0, limit), defMax)
|
|
}
|
|
|
|
func (m *MemArbitrator) doCancelPendingTasks(prio ArbitrationPriority, waitAverse bool) (cnt int64) {
|
|
var entries [64]*rootPoolEntry
|
|
|
|
fifo := &m.tasks.fifoWaitAverse
|
|
reason := ArbitratorWaitAverseCancel
|
|
if !waitAverse {
|
|
fifo = &m.tasks.fifoByPriority[prio]
|
|
reason = ArbitratorStandardCancel
|
|
}
|
|
|
|
for {
|
|
size := 0
|
|
{
|
|
m.tasks.Lock()
|
|
|
|
for {
|
|
entry := fifo.front()
|
|
if entry == nil {
|
|
break
|
|
}
|
|
if m.removeTaskImpl(entry) {
|
|
entries[size] = entry
|
|
size++
|
|
}
|
|
if size == len(entries) {
|
|
break
|
|
}
|
|
}
|
|
|
|
m.tasks.Unlock()
|
|
}
|
|
|
|
for i := range size {
|
|
entry := entries[i]
|
|
if ctx := entry.ctx.Load(); ctx.available() {
|
|
ctx.stop(reason)
|
|
}
|
|
entry.windUp(0, ArbitrateFail)
|
|
}
|
|
|
|
cnt += int64(size)
|
|
|
|
if size != len(entries) {
|
|
break
|
|
}
|
|
}
|
|
|
|
return cnt
|
|
}
|
|
|
|
func (m *MemArbitrator) doExecuteFirstTask() (exec bool) {
|
|
if m.tasks.fifoTasks.approxEmpty() {
|
|
return
|
|
}
|
|
|
|
entry := m.extractFirstTaskEntry()
|
|
|
|
if entry == nil {
|
|
return
|
|
}
|
|
|
|
if entry.arbitratorMu.destroyed {
|
|
if m.removeTask(entry) {
|
|
entry.windUp(0, ArbitrateFail)
|
|
}
|
|
return true
|
|
}
|
|
|
|
{
|
|
ok, reclaimedBytes := m.arbitrate(entry)
|
|
|
|
if ok {
|
|
exec = true
|
|
|
|
if m.removeTask(entry) {
|
|
if m.execMu.mode == ArbitratorModePriority {
|
|
m.execMetrics.Task.SuccByPriority[entry.taskMu.fifoByPriority.priority]++
|
|
}
|
|
m.entryMap.addQuota(entry, reclaimedBytes)
|
|
// wind up & publish the result
|
|
entry.windUp(reclaimedBytes, ArbitrateOk)
|
|
} else {
|
|
// subscription task may have been canceled
|
|
m.release(reclaimedBytes)
|
|
}
|
|
} else {
|
|
m.release(reclaimedBytes)
|
|
m.updateBlockedAt()
|
|
m.doReclaimByWorkMode(entry, reclaimedBytes)
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
func (m *MemArbitrator) doReclaimNonBlockingTasks() {
|
|
if m.execMu.mode == ArbitratorModeStandard {
|
|
for prio := minArbitrationPriority; prio < maxArbitrationPriority; prio++ {
|
|
if m.taskNumByPriority(prio) != 0 {
|
|
m.execMetrics.Cancel.StandardMode += m.doCancelPendingTasks(prio, false)
|
|
}
|
|
}
|
|
} else if m.taskNumOfWaitAverse() != 0 {
|
|
m.execMetrics.Cancel.WaitAverse += m.doCancelPendingTasks(maxArbitrationPriority, true)
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) doReclaimByWorkMode(entry *rootPoolEntry, reclaimedBytes int64) {
|
|
waitAverse := entry.ctx.waitAverse
|
|
m.doReclaimNonBlockingTasks()
|
|
// entry's ctx may have been modified
|
|
if waitAverse {
|
|
return
|
|
}
|
|
if m.execMu.mode == ArbitratorModePriority {
|
|
m.doReclaimMemByPriority(entry, entry.request.quota-reclaimedBytes)
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) doExecuteCleanupTasks() {
|
|
for {
|
|
var entry *rootPoolEntry
|
|
{
|
|
m.cleanupMu.Lock()
|
|
|
|
entry = m.cleanupMu.fifoTasks.popFront()
|
|
|
|
m.cleanupMu.Unlock()
|
|
}
|
|
if entry == nil {
|
|
break
|
|
}
|
|
|
|
if m.privilegedEntry == entry {
|
|
m.privilegedEntry = nil
|
|
}
|
|
m.deleteUnderCancel(entry)
|
|
m.deleteUnderKill(entry)
|
|
|
|
if !entry.stateMu.stop.Load() { // reset pool entry
|
|
toRelease := entry.stateMu.quotaToReclaim.Swap(0)
|
|
if toRelease > 0 {
|
|
m.release(toRelease)
|
|
atomic.AddUint64(&m.mu.released, uint64(toRelease))
|
|
m.entryMap.addQuota(entry, -toRelease)
|
|
}
|
|
} else {
|
|
if !entry.arbitratorMu.destroyed {
|
|
if entry.arbitratorMu.quota > 0 {
|
|
m.release(entry.arbitratorMu.quota)
|
|
atomic.AddUint64(&m.mu.released, uint64(entry.arbitratorMu.quota))
|
|
}
|
|
m.entryMap.delete(entry)
|
|
m.rootPoolNum.Add(-1)
|
|
entry.arbitratorMu.destroyed = true
|
|
}
|
|
|
|
if m.removeTask(entry) {
|
|
entry.windUp(0, ArbitrateFail)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) implicitRun() { // satisfy any subscription task
|
|
if m.tasks.fifoTasks.approxEmpty() {
|
|
return
|
|
}
|
|
|
|
for { // make all tasks success
|
|
entry := m.frontTaskEntry()
|
|
if entry == nil {
|
|
break
|
|
}
|
|
|
|
if entry.arbitratorMu.destroyed {
|
|
if m.removeTask(entry) {
|
|
entry.windUp(0, ArbitrateFail)
|
|
}
|
|
continue
|
|
}
|
|
|
|
if m.removeTask(entry) {
|
|
m.alloc(entry.request.quota)
|
|
m.entryMap.addQuota(entry, entry.request.quota)
|
|
entry.windUp(entry.request.quota, ArbitrateOk)
|
|
}
|
|
}
|
|
}
|
|
|
|
// -1: at ArbitratorModeDisable
|
|
// -2: mem unsafe
|
|
// >= 0: execute / cancel task num
|
|
func (m *MemArbitrator) runOneRound() (taskExecNum int) {
|
|
m.execMu.startTime = now()
|
|
if t := m.execMu.startTime.Unix(); t != m.approxUnixTimeSec() { // update per second duration and reduce force sharing
|
|
m.setUnixTimeSec(t)
|
|
}
|
|
|
|
if mode := m.workMode(); m.execMu.mode != mode {
|
|
m.execMu.mode = mode
|
|
if mode == ArbitratorModeDisable { // switch to disable mode
|
|
m.setMemSafe()
|
|
m.execMu.blockedState.reset()
|
|
}
|
|
}
|
|
|
|
if !m.cleanupMu.fifoTasks.approxEmpty() {
|
|
m.doExecuteCleanupTasks()
|
|
}
|
|
|
|
if m.execMu.mode == ArbitratorModeDisable {
|
|
m.implicitRun()
|
|
return -1
|
|
}
|
|
|
|
if !m.handleMemIssues() { // mem is still unsafe
|
|
return -2
|
|
}
|
|
|
|
for m.doExecuteFirstTask() {
|
|
taskExecNum++
|
|
}
|
|
|
|
return taskExecNum
|
|
}
|
|
|
|
func (m *MemArbitrator) asyncRun(duration time.Duration) bool {
|
|
if m.controlMu.running.Load() {
|
|
return false
|
|
}
|
|
m.controlMu.running.Store(true)
|
|
m.controlMu.finishCh = make(chan struct{})
|
|
|
|
go func() {
|
|
ticker := time.NewTicker(duration)
|
|
for m.controlMu.running.Load() {
|
|
select {
|
|
case <-ticker.C:
|
|
m.weakWake()
|
|
case <-m.notifer.C:
|
|
m.notifer.clear()
|
|
m.runOneRound()
|
|
}
|
|
}
|
|
|
|
ticker.Stop()
|
|
close(m.controlMu.finishCh)
|
|
}()
|
|
return true
|
|
}
|
|
|
|
// Restart starts the root pool with the given context
|
|
func (r *rootPool) Restart(ctx *ArbitrationContext) bool {
|
|
return r.arbitrator.restartEntryByContext(r.entry, ctx)
|
|
}
|
|
|
|
// Pool returns the internal resource pool
|
|
func (r *rootPool) Pool() *ResourcePool {
|
|
return r.entry.pool
|
|
}
|
|
|
|
func (m *MemArbitrator) restartEntryByContext(entry *rootPoolEntry, ctx *ArbitrationContext) bool {
|
|
if entry == nil {
|
|
return false
|
|
}
|
|
entry.stateMu.Lock()
|
|
defer entry.stateMu.Unlock()
|
|
|
|
if entry.stateMu.stop.Load() || entry.execState() != execStateIdle {
|
|
return false
|
|
}
|
|
|
|
entry.pool.mu.Lock()
|
|
defer entry.pool.mu.Unlock()
|
|
|
|
if ctx != nil {
|
|
if ctx.PrevMaxMem > m.buffer.size.Load() {
|
|
m.setBufferSize(ctx.PrevMaxMem)
|
|
} else if ctx.memQuotaLimit > m.buffer.quotaLimit.Load() {
|
|
m.setBufferSize(ctx.memQuotaLimit)
|
|
m.setQuotaLimit(ctx.memQuotaLimit)
|
|
}
|
|
|
|
if ctx.waitAverse {
|
|
entry.ctx.preferPrivilege = false
|
|
entry.ctx.memPriority = ArbitrationPriorityHigh
|
|
} else {
|
|
entry.ctx.preferPrivilege = ctx.preferPrivilege
|
|
entry.ctx.memPriority = ctx.memPriority
|
|
}
|
|
|
|
entry.ctx.cancelCh = ctx.cancelCh
|
|
entry.ctx.waitAverse = ctx.waitAverse
|
|
} else {
|
|
entry.ctx.cancelCh = nil
|
|
entry.ctx.waitAverse = false
|
|
entry.ctx.memPriority = ArbitrationPriorityMedium
|
|
entry.ctx.preferPrivilege = false
|
|
}
|
|
|
|
entry.ctx.Store(ctx)
|
|
|
|
if _, loaded := m.entryMap.contextCache.LoadOrStore(entry.pool.uid, entry); !loaded {
|
|
m.entryMap.contextCache.num.Add(1)
|
|
}
|
|
|
|
if entry.pool.actions.OutOfCapacityActionCB == nil {
|
|
entry.pool.actions.OutOfCapacityActionCB = func(s OutOfCapacityActionArgs) error {
|
|
if m.blockingAllocate(entry, s.Request) != ArbitrateOk {
|
|
return errArbitrateFailError
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
entry.pool.mu.stopped = false
|
|
|
|
entry.setExecState(execStateRunning)
|
|
|
|
return true
|
|
}
|
|
|
|
// DebugFields is used to store debug fields for logging
|
|
type DebugFields struct {
|
|
fields [30]zap.Field
|
|
n int
|
|
}
|
|
|
|
// ConcurrentBudget represents a wrapped budget of the resource pool for concurrent usage
|
|
type ConcurrentBudget struct {
|
|
Pool *ResourcePool
|
|
Capacity int64
|
|
LastUsedTimeSec int64
|
|
sync.Mutex
|
|
_ cpuCacheLinePad
|
|
Used atomic.Int64
|
|
_ cpuCacheLinePad
|
|
}
|
|
|
|
//go:norace
|
|
func (b *ConcurrentBudget) setLastUsedTimeSec(t int64) {
|
|
b.LastUsedTimeSec = t
|
|
}
|
|
|
|
//go:norace
|
|
func (b *ConcurrentBudget) approxCapacity() int64 {
|
|
return b.Capacity
|
|
}
|
|
|
|
func (b *ConcurrentBudget) getLastUsedTimeSec() int64 {
|
|
return b.LastUsedTimeSec
|
|
}
|
|
|
|
// TrackedConcurrentBudget combines a ConcurrentBudget with a tracked heap-inuse counter
|
|
type TrackedConcurrentBudget struct {
|
|
ConcurrentBudget
|
|
HeapInuse atomic.Int64
|
|
_ cpuCacheLinePad
|
|
}
|
|
|
|
// Clear clears the concurrent budget and returns the capacity
|
|
func (b *ConcurrentBudget) Clear() int64 {
|
|
b.Lock()
|
|
|
|
budgetCap := b.Capacity
|
|
b.Capacity = 0
|
|
b.Used.Store(0)
|
|
if budgetCap > 0 {
|
|
b.Pool.release(budgetCap)
|
|
}
|
|
b.Pool = nil
|
|
|
|
b.Unlock()
|
|
return budgetCap
|
|
}
|
|
|
|
// Reserve reserves a given capacity for the concurrent budget
|
|
func (b *ConcurrentBudget) Reserve(newCap int64) (err error) {
|
|
b.Lock()
|
|
|
|
extra := max(newCap, b.Used.Load(), b.Capacity) - b.Capacity
|
|
if err = b.Pool.allocate(extra); err == nil {
|
|
b.Capacity += extra
|
|
}
|
|
|
|
b.Unlock()
|
|
return
|
|
}
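// Usage sketch (illustrative only, not part of the original file): a caller that owns a
// *ResourcePool can pre-reserve quota on a ConcurrentBudget for a burst of work and hand
// everything back afterwards. The `pool` variable and the 64 MiB figure are assumptions.
//
//	b := &ConcurrentBudget{Pool: pool}
//	if err := b.Reserve(64 << 20); err == nil {
//		// ... run work that consumes up to 64 MiB of quota ...
//		b.Clear() // releases the reserved capacity back to the pool and detaches it
//	}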
|
|
|
|
// PullFromUpstream tries to pull from the upstream pool when facing `out of capacity`
|
|
// It requires the action of the pool to be non-blocking
|
|
func (b *ConcurrentBudget) PullFromUpstream() (err error) {
|
|
b.Lock()
|
|
|
|
delta := b.Used.Load() - b.Capacity
|
|
if delta > 0 {
|
|
delta = b.Pool.roundSize(delta)
|
|
if err = b.Pool.allocate(delta); err == nil {
|
|
b.Capacity += delta
|
|
}
|
|
}
|
|
|
|
b.Unlock()
|
|
return
|
|
}
|
|
|
|
// AutoRun starts the work goroutine of the mem-arbitrator asynchronously
|
|
func (m *MemArbitrator) AutoRun(
|
|
actions MemArbitratorActions,
|
|
awaitFreePoolAllocAlignSize, awaitFreePoolShardNum int64,
|
|
taskTickDur time.Duration,
|
|
) bool {
|
|
m.controlMu.Lock()
|
|
defer m.controlMu.Unlock()
|
|
|
|
if m.controlMu.running.Load() {
|
|
return false
|
|
}
|
|
|
|
{ // init
|
|
m.actions = actions
|
|
m.refreshRuntimeMemStats()
|
|
m.initAwaitFreePool(awaitFreePoolAllocAlignSize, awaitFreePoolShardNum)
|
|
}
|
|
return m.asyncRun(taskTickDur)
|
|
}
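// Startup sketch (hedged): `arb` and its MemArbitratorActions value are assumed to be
// constructed elsewhere; the constants are the package defaults declared at the top of
// this file.
//
//	started := arb.AutoRun(actions, defAwaitFreePoolAllocAlignSize,
//		defAwaitFreePoolShardNum, defTaskTickDur)
//	if !started {
//		// a worker goroutine is already running, so this call was a no-op
//	}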
|
|
|
|
func (m *MemArbitrator) refreshRuntimeMemStats() {
|
|
if m.actions.UpdateRuntimeMemStats != nil {
|
|
m.actions.UpdateRuntimeMemStats() // should invoke `SetRuntimeMemStats`
|
|
}
|
|
atomic.AddInt64(&m.execMetrics.Action.UpdateRuntimeMemStats, 1)
|
|
}
|
|
|
|
// RuntimeMemStats represents the runtime memory statistics
|
|
type RuntimeMemStats struct {
|
|
HeapAlloc, HeapInuse, TotalFree, MemOffHeap, LastGC int64
|
|
}
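// Population sketch (an assumption about how a caller might fill RuntimeMemStats from the
// Go runtime; this file does not dictate the mapping, and MemOffHeap in particular must
// come from an external off-heap tracker). Requires importing "runtime".
//
//	var ms runtime.MemStats
//	runtime.ReadMemStats(&ms)
//	arb.SetRuntimeMemStats(RuntimeMemStats{
//		HeapAlloc:  int64(ms.HeapAlloc),
//		HeapInuse:  int64(ms.HeapInuse),
//		TotalFree:  int64(ms.TotalAlloc - ms.HeapAlloc), // assumed proxy for cumulative freed bytes
//		MemOffHeap: 0,                                   // placeholder; supply real off-heap usage if tracked
//		LastGC:     int64(ms.LastGC),
//	})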
|
|
|
|
func (m *MemArbitrator) trySetRuntimeMemStats(s RuntimeMemStats) bool {
|
|
if m.heapController.TryLock() {
|
|
m.doSetRuntimeMemStats(s)
|
|
m.heapController.Unlock()
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// SetRuntimeMemStats sets the runtime memory statistics. It may be invoked by `refreshRuntimeMemStats` -> `actions.UpdateRuntimeMemStats`
|
|
func (m *MemArbitrator) SetRuntimeMemStats(s RuntimeMemStats) {
|
|
m.heapController.Lock()
|
|
m.doSetRuntimeMemStats(s)
|
|
m.heapController.Unlock()
|
|
}
|
|
|
|
func (m *MemArbitrator) doSetRuntimeMemStats(s RuntimeMemStats) {
|
|
m.heapController.heapAlloc.Store(s.HeapAlloc)
|
|
m.heapController.heapInuse.Store(s.HeapInuse)
|
|
m.heapController.heapTotalFree.Store(s.TotalFree)
|
|
m.heapController.memOffHeap.Store(s.MemOffHeap)
|
|
m.heapController.memInuse.Store(s.MemOffHeap + s.HeapInuse)
|
|
|
|
if s.LastGC > m.heapController.lastGC.utime.Load() {
|
|
m.heapController.lastGC.heapAlloc.Store(s.HeapAlloc)
|
|
m.heapController.lastGC.utime.Store(s.LastGC)
|
|
}
|
|
|
|
m.updateAvoidSize() // calc out-of-control & avoid size
|
|
}
|
|
|
|
func (m *MemArbitrator) updateAvoidSize() {
|
|
capacity := m.softLimit()
|
|
if m.mu.softLimit.mode == SoftLimitModeAuto {
|
|
if ratio := m.memMagnif(); ratio != 0 {
|
|
newCap := calcRatio(m.limit(), ratio)
|
|
capacity = min(capacity, newCap)
|
|
}
|
|
}
|
|
avoidSize := max(
|
|
0,
|
|
m.heapController.heapAlloc.Load()+m.heapController.memOffHeap.Load()-m.avoidance.heapTracked.Load(), // out-of-control size
|
|
m.limit()-capacity,
|
|
)
|
|
m.avoidance.size.Store(avoidSize)
|
|
|
|
if delta := m.allocated() - m.limit() + avoidSize; delta > 0 && m.awaitFree.pool.allocated() > 0 {
|
|
reclaimed := int64(0)
|
|
poolReleased := int64(0)
|
|
for i := range len(m.awaitFree.budget.shards) {
|
|
idx := (m.avoidance.awaitFreeBudgetKickOutIdx + uint64(i) + 1) & m.awaitFree.budget.sizeMask
|
|
b := &m.awaitFree.budget.shards[idx]
|
|
if b.approxCapacity() > 0 && b.TryLock() {
|
|
x := min(delta-reclaimed, b.Capacity)
|
|
b.Capacity -= x
|
|
reclaimed += x
|
|
b.Unlock()
|
|
}
|
|
|
|
if reclaimed >= delta {
|
|
m.avoidance.awaitFreeBudgetKickOutIdx = idx
|
|
break
|
|
}
|
|
}
|
|
if reclaimed > 0 {
|
|
m.awaitFree.pool.mu.Lock()
|
|
|
|
m.awaitFree.pool.doRelease(reclaimed) // return the reclaimed quota to the await-free pool
|
|
poolReleased = m.awaitFree.pool.mu.budget.release()
|
|
|
|
m.awaitFree.pool.mu.Unlock()
|
|
}
|
|
if poolReleased > 0 {
|
|
m.release(poolReleased)
|
|
atomic.AddInt64(&m.execMetrics.AwaitFree.ForceShrink, 1)
|
|
atomic.AddUint64(&m.mu.released, uint64(poolReleased))
|
|
}
|
|
}
|
|
}
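// Worked example of the avoidance calculation above (hypothetical numbers): with
// limit = 10 GiB, soft-limit capacity = 9 GiB, heapAlloc + memOffHeap = 6 GiB and
// heapTracked = 5.5 GiB, the out-of-control part is 0.5 GiB and limit - capacity is
// 1 GiB, so avoidSize = max(0, 0.5 GiB, 1 GiB) = 1 GiB.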
|
|
|
|
func (m *MemArbitrator) weakWake() {
|
|
m.notifer.WeakWake()
|
|
}
|
|
|
|
func (m *MemArbitrator) wake() {
|
|
m.notifer.Wake()
|
|
}
|
|
|
|
func (m *MemArbitrator) updatePoolMediumCapacity(utimeMilli int64) {
|
|
s := &m.poolAllocStats
|
|
const maxNum = int64(len(s.timedMap))
|
|
const maxDur = maxNum - defRedundancy
|
|
|
|
{
|
|
s.RLock()
|
|
|
|
tsAlign := utimeMilli / kilo / defUpdateMemConsumedTimeAlignSec
|
|
tar1 := &s.timedMap[(maxNum+tsAlign-1)%maxNum]
|
|
tar2 := &s.timedMap[tsAlign%maxNum]
|
|
|
|
if ts := tar1.tsAlign.Load(); ts <= tsAlign-maxDur || ts > tsAlign {
|
|
tar1 = nil
|
|
}
|
|
if ts := tar2.tsAlign.Load(); ts <= tsAlign-maxDur || ts > tsAlign {
|
|
tar2 = nil
|
|
}
|
|
|
|
total := uint64(0)
|
|
if tar1 != nil {
|
|
tar1.RLock()
|
|
total += tar1.num.Load()
|
|
}
|
|
if tar2 != nil {
|
|
tar2.RLock()
|
|
total += tar2.num.Load()
|
|
}
|
|
|
|
if total != 0 {
|
|
expect := max(1, (total+1)/2)
|
|
cnt := uint64(0)
|
|
index := 0
|
|
|
|
for i := range defServerlimitMinUnitNum {
|
|
if tar1 != nil {
|
|
cnt += uint64(tar1.slot[i])
|
|
}
|
|
if tar2 != nil {
|
|
cnt += uint64(tar2.slot[i])
|
|
}
|
|
if cnt >= expect {
|
|
index = i
|
|
break
|
|
}
|
|
}
|
|
|
|
res := s.PoolAllocUnit * int64(index+1)
|
|
|
|
s.mediumQuota.Store(res)
|
|
}
|
|
|
|
if tar1 != nil {
|
|
tar1.RUnlock()
|
|
}
|
|
if tar2 != nil {
|
|
tar2.RUnlock()
|
|
}
|
|
|
|
s.RUnlock()
|
|
}
|
|
|
|
m.tryStorePoolMediumCapacity(utimeMilli, m.poolMediumQuota())
|
|
}
|
|
|
|
func (m *MemArbitrator) tryStorePoolMediumCapacity(utimeMilli int64, capacity int64) bool {
|
|
if capacity == 0 {
|
|
return false
|
|
}
|
|
if lastState := m.lastMemState(); lastState == nil ||
|
|
(m.poolAllocStats.lastUpdateUtimeMilli.Load()+defStorePoolMediumCapDurMilli <= utimeMilli &&
|
|
lastState.PoolMediumCap != capacity) {
|
|
var memState *RuntimeMemStateV1
|
|
|
|
if lastState != nil {
|
|
s := *lastState // copy
|
|
s.PoolMediumCap = capacity
|
|
memState = &s
|
|
} else {
|
|
memState = &RuntimeMemStateV1{
|
|
Version: 1,
|
|
PoolMediumCap: capacity,
|
|
}
|
|
}
|
|
|
|
_ = m.recordMemState(memState, "new root pool medium cap")
|
|
m.poolAllocStats.lastUpdateUtimeMilli.Store(utimeMilli)
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (m *MemArbitrator) poolMediumQuota() int64 {
|
|
return m.poolAllocStats.mediumQuota.Load()
|
|
}
|
|
|
|
// SuggestPoolInitCap returns the suggested initial capacity for the pool
|
|
func (m *MemArbitrator) SuggestPoolInitCap() int64 {
|
|
return m.poolMediumQuota()
|
|
}
|
|
|
|
func (m *MemArbitrator) updateMemMagnification(utimeMilli int64) (updatedPreProf *memProfile) {
|
|
const maxNum = int64(len(m.heapController.timedMemProfile))
|
|
|
|
curTsAlign := utimeMilli / kilo / defUpdateMemMagnifUtimeAlign
|
|
profs := &m.heapController.timedMemProfile
|
|
cur := &profs[curTsAlign%maxNum]
|
|
if cur.tsAlign < curTsAlign {
|
|
{ // update previous record
|
|
preTs := curTsAlign - 1
|
|
preIdx := (maxNum + preTs) % maxNum
|
|
pre := &profs[preIdx]
|
|
|
|
pre.ratio = 0
|
|
if pre.tsAlign == preTs && pre.quota > 0 {
|
|
if pre.heap > 0 {
|
|
pre.ratio = calcRatio(pre.heap, pre.quota)
|
|
}
|
|
updatedPreProf = pre
|
|
}
|
|
}
|
|
|
|
v := int64(0)
|
|
// check the memory profile in the last 60s
|
|
for _, tsAlign := range []int64{curTsAlign - 2, curTsAlign - 1} {
|
|
tar := &profs[(maxNum+tsAlign)%maxNum]
|
|
if tar.tsAlign != tsAlign ||
|
|
tar.heap >= m.oomRisk() { // calculate the magnification only when the heap is safe
|
|
v = 0
|
|
break
|
|
}
|
|
|
|
if tar.ratio <= 0 {
|
|
break // stop once a record has no valid ratio
|
|
}
|
|
v = max(v, tar.ratio)
|
|
}
|
|
|
|
updated := false
|
|
var oriRatio, newRatio int64
|
|
|
|
if v != 0 && m.avoidance.memMagnif.TryLock() {
|
|
if oriRatio = m.memMagnif(); oriRatio != 0 && v < oriRatio-10 /* 1 percent */ {
|
|
newRatio = (oriRatio + v) / 2
|
|
if newRatio <= kilo {
|
|
newRatio = 0
|
|
}
|
|
m.doSetMemMagnif(newRatio)
|
|
updated = true
|
|
}
|
|
m.avoidance.memMagnif.Unlock()
|
|
}
|
|
|
|
if updated {
|
|
m.actions.Info("Update mem quota magnification ratio",
|
|
zap.Int64("ori-ratio(‰)", oriRatio),
|
|
zap.Int64("new-ratio(‰)", newRatio),
|
|
)
|
|
|
|
if lastMemState := m.lastMemState(); lastMemState != nil && newRatio < lastMemState.Magnif {
|
|
memState := RuntimeMemStateV1{
|
|
Version: 1,
|
|
Magnif: newRatio,
|
|
PoolMediumCap: m.poolMediumQuota(),
|
|
}
|
|
_ = m.recordMemState(&memState, "new magnification ratio")
|
|
}
|
|
}
|
|
|
|
*cur = memProfile{
|
|
tsAlign: curTsAlign,
|
|
startUtimeMilli: utimeMilli,
|
|
}
|
|
}
|
|
|
|
if cur.tsAlign == curTsAlign {
|
|
if ut := m.heapController.lastGC.utime.Load(); curTsAlign == ut/1e9/defUpdateMemMagnifUtimeAlign {
|
|
cur.heap = max(cur.heap, m.heapController.lastGC.heapAlloc.Load())
|
|
}
|
|
if blockedSize, utimeSec := m.lastBlockedAt(); utimeSec/defUpdateMemMagnifUtimeAlign == curTsAlign {
|
|
cur.quota = max(cur.quota, blockedSize)
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
func (m *MemArbitrator) doSetMemMagnif(ratio int64) {
|
|
m.avoidance.memMagnif.ratio.Store(ratio)
|
|
}
|
|
|
|
func (m *MemArbitrator) memMagnif() int64 {
|
|
return m.avoidance.memMagnif.ratio.Load()
|
|
}
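// Scale note (inferred from the zap "(‰)" keys and assuming calcRatio(a, b) ≈ a*1000/b):
// the magnification ratio is stored in permille, so a ratio of 1250 means past heap usage
// was roughly 1.25x the granted quota; under SoftLimitModeAuto, updateAvoidSize would then
// shrink a hypothetical 10 GiB limit to about 10 GiB * 1000 / 1250 = 8 GiB of usable capacity.
// A ratio of 0 means no magnification has been recorded yet.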
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) awaitFreePoolCap() int64 {
|
|
if m.awaitFree.pool == nil {
|
|
return 0
|
|
}
|
|
return m.awaitFree.pool.capacity()
|
|
}
|
|
|
|
type memPoolQuotaUsage struct{ trackedHeap, quota int64 }
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) awaitFreePoolUsed() (res memPoolQuotaUsage) {
|
|
for i := range m.awaitFree.budget.shards {
|
|
if d := m.awaitFree.budget.shards[i].Used.Load(); d > 0 {
|
|
res.quota += d
|
|
}
|
|
if d := m.awaitFree.budget.shards[i].HeapInuse.Load(); d > 0 {
|
|
res.trackedHeap += d
|
|
}
|
|
}
|
|
m.awaitFree.lastQuotaUsage = res
|
|
return
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) approxAwaitFreePoolUsed() memPoolQuotaUsage {
|
|
return m.awaitFree.lastQuotaUsage
|
|
}
|
|
|
|
func (m *MemArbitrator) executeTick(utimeMilli int64) bool { // exec batch tasks every 1s
|
|
if m.atMemRisk() { // skip while the mem state is unsafe and the OOM check is still running
|
|
return false
|
|
}
|
|
|
|
if m.tickTask.lastTickUtimeMilli.Load()+defTickDurMilli > utimeMilli {
|
|
return false
|
|
}
|
|
m.tickTask.Lock()
|
|
defer m.tickTask.Unlock()
|
|
|
|
m.tickTask.lastTickUtimeMilli.Store(utimeMilli)
|
|
|
|
// mem magnification
|
|
if updatedPreProf := m.updateMemMagnification(utimeMilli); updatedPreProf != nil {
|
|
pre := updatedPreProf
|
|
profile := m.recordDebugProfile()
|
|
profile.append(
|
|
zap.Int64("last-blocked-heap", pre.heap), zap.Int64("last-blocked-quota", pre.quota),
|
|
zap.Int64("last-magnification-ratio(‰)", pre.ratio),
|
|
zap.Time("last-prof-start-time", time.UnixMilli(pre.startUtimeMilli)),
|
|
)
|
|
m.actions.Info("Mem profile timeline",
|
|
profile.fields[:profile.n]...,
|
|
)
|
|
}
|
|
// suggest pool cap
|
|
m.updatePoolMediumCapacity(utimeMilli)
|
|
// shrink mem profile cache
|
|
m.shrinkDigestProfile(utimeMilli/kilo, m.digestProfileCache.limit, m.digestProfileCache.limit/2)
|
|
return true
|
|
}
|
|
|
|
func (d *DebugFields) append(f ...zap.Field) {
|
|
n := min(len(f), len(d.fields)-d.n)
|
|
for i := range n {
|
|
d.fields[d.n] = f[i]
|
|
d.n++
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) recordDebugProfile() (f DebugFields) {
|
|
taskNumByMode := m.TaskNumByPattern()
|
|
memMagnif := m.memMagnif()
|
|
if memMagnif == 0 {
|
|
memMagnif = -1
|
|
}
|
|
f.append(
|
|
zap.Int64("heap-inuse", m.heapController.heapInuse.Load()),
|
|
zap.Int64("heap-alloc", m.heapController.heapAlloc.Load()),
|
|
zap.Int64("mem-off-heap", m.heapController.memOffHeap.Load()),
|
|
zap.Int64("mem-inuse", m.heapController.memInuse.Load()),
|
|
zap.Int64("hard-limit", m.limit()),
|
|
zap.Int64("quota-allocated", m.allocated()),
|
|
zap.Int64("quota-softlimit", m.softLimit()),
|
|
zap.Int64("mem-magnification-ratio(‰)", memMagnif),
|
|
zap.Int64("root-pool-num", m.RootPoolNum()),
|
|
zap.Int64("awaitfree-pool-cap", m.awaitFreePoolCap()),
|
|
zap.Int64("awaitfree-pool-used", m.approxAwaitFreePoolUsed().quota),
|
|
zap.Int64("awaitfree-pool-heapinuse", m.approxAwaitFreePoolUsed().trackedHeap),
|
|
zap.Int64("tracked-heapinuse", m.avoidance.heapTracked.Load()),
|
|
zap.Int64("out-of-control", m.avoidance.size.Load()),
|
|
zap.Int64("buffer", m.buffer.size.Load()),
|
|
zap.Int64("task-num", m.TaskNum()),
|
|
zap.Int64("task-priority-low", taskNumByMode[ArbitrationPriorityLow]),
|
|
zap.Int64("task-priority-medium", taskNumByMode[ArbitrationPriorityMedium]),
|
|
zap.Int64("task-priority-high", taskNumByMode[ArbitrationPriorityHigh]),
|
|
zap.Int64("pending-alloc-size", m.WaitingAllocSize()),
|
|
zap.Int64("digest-cache-num", m.digestProfileCache.num.Load()),
|
|
)
|
|
if memRisk := m.heapController.memRisk.startTime.nano.Load(); memRisk != 0 {
|
|
f.append(zap.Time("mem-risk-start", time.Unix(0, memRisk)))
|
|
}
|
|
return
|
|
}
|
|
|
|
// HandleRuntimeStats handles the runtime memory statistics
|
|
func (m *MemArbitrator) HandleRuntimeStats(s RuntimeMemStats) {
|
|
// shrink fast alloc pool
|
|
m.tryShrinkAwaitFreePool(defPoolReservedQuota, nowUnixMilli())
|
|
// update tracked mem stats
|
|
m.tryUpdateTrackedMemStats(nowUnixMilli())
|
|
// set runtime mem stats & update avoidance size
|
|
m.trySetRuntimeMemStats(s)
|
|
m.executeTick(nowUnixMilli())
|
|
m.weakWake()
|
|
}
|
|
|
|
func (m *MemArbitrator) tryUpdateTrackedMemStats(utimeMilli int64) bool {
|
|
if m.avoidance.heapTracked.lastUpdateUtimeMilli.Load()+defTrackMemStatsDurMilli <= utimeMilli {
|
|
m.updateTrackedHeapStats()
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (m *MemArbitrator) updateTrackedHeapStats() {
|
|
totalTrackedHeap := int64(0)
|
|
if m.entryMap.contextCache.num.Load() != 0 {
|
|
maxMemUsed := int64(0)
|
|
m.entryMap.contextCache.Range(func(_, value any) bool {
|
|
e := value.(*rootPoolEntry)
|
|
if e.notRunning() {
|
|
return true
|
|
}
|
|
if ctx := e.ctx.Load(); ctx.available() {
|
|
if memUsed := ctx.arbitrateHelper.HeapInuse(); memUsed > 0 {
|
|
totalTrackedHeap += memUsed
|
|
maxMemUsed = max(maxMemUsed, memUsed)
|
|
}
|
|
}
|
|
return true
|
|
})
|
|
if m.buffer.size.Load() < maxMemUsed {
|
|
m.tryToUpdateBuffer(maxMemUsed, 0, m.approxUnixTimeSec())
|
|
}
|
|
}
|
|
|
|
totalTrackedHeap += m.awaitFreePoolUsed().trackedHeap
|
|
m.avoidance.heapTracked.Store(totalTrackedHeap)
|
|
m.avoidance.heapTracked.lastUpdateUtimeMilli.Store(nowUnixMilli())
|
|
}
|
|
|
|
func (m *MemArbitrator) tryShrinkAwaitFreePool(minRemain int64, utimeMilli int64) bool {
|
|
if m.awaitFree.lastShrinkUtimeMilli.Load()+defAwaitFreePoolShrinkDurMilli <= utimeMilli {
|
|
m.shrinkAwaitFreePool(minRemain, utimeMilli)
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (m *MemArbitrator) shrinkAwaitFreePool(minRemain int64, utimeMilli int64) {
|
|
if m.awaitFree.pool.allocated() <= 0 {
|
|
return
|
|
}
|
|
|
|
poolReleased := int64(0)
|
|
reclaimed := int64(0)
|
|
|
|
for i := range m.awaitFree.budget.shards {
|
|
b := &m.awaitFree.budget.shards[i]
|
|
|
|
if used := b.Used.Load(); used > 0 {
|
|
if b.approxCapacity()-(used+minRemain) >= b.Pool.allocAlignSize && b.TryLock() {
|
|
if used = b.Used.Load(); used > 0 {
|
|
toReclaim := b.Capacity - (used + minRemain)
|
|
|
|
if toReclaim >= b.Pool.allocAlignSize {
|
|
b.Capacity -= toReclaim
|
|
reclaimed += toReclaim
|
|
}
|
|
}
|
|
b.Unlock()
|
|
}
|
|
} else {
|
|
if b.approxCapacity() > 0 && b.getLastUsedTimeSec()*kilo+defAwaitFreePoolShrinkDurMilli <= utimeMilli && b.TryLock() {
|
|
if toReclaim := b.Capacity; b.Used.Load() <= 0 && toReclaim > 0 {
|
|
b.Capacity -= toReclaim
|
|
reclaimed += toReclaim
|
|
}
|
|
b.Unlock()
|
|
}
|
|
}
|
|
}
|
|
|
|
if reclaimed > 0 {
|
|
m.awaitFree.pool.mu.Lock()
|
|
|
|
m.awaitFree.pool.doRelease(reclaimed)
|
|
poolReleased = m.awaitFree.pool.mu.budget.release()
|
|
|
|
m.awaitFree.pool.mu.Unlock()
|
|
}
|
|
if poolReleased > 0 {
|
|
m.release(poolReleased)
|
|
atomic.AddUint64(&m.mu.released, uint64(poolReleased))
|
|
atomic.AddInt64(&m.execMetrics.AwaitFree.Shrink, 1)
|
|
m.weakWake()
|
|
}
|
|
|
|
m.awaitFree.lastShrinkUtimeMilli.Store(utimeMilli)
|
|
}
|
|
|
|
func (m *MemArbitrator) isMemSafe() bool {
|
|
return m.heapController.memInuse.Load() < m.oomRisk()
|
|
}
|
|
|
|
func (m *MemArbitrator) isMemNoRisk() bool {
|
|
return m.heapController.memInuse.Load() < m.memRisk()
|
|
}
|
|
|
|
func (m *MemArbitrator) calcMemRisk() *RuntimeMemStateV1 {
|
|
if m.mu.softLimit.mode != SoftLimitModeAuto {
|
|
return nil
|
|
}
|
|
|
|
memState := RuntimeMemStateV1{
|
|
Version: 1,
|
|
LastRisk: LastRisk{
|
|
HeapAlloc: m.heapController.heapAlloc.Load(),
|
|
QuotaAlloc: m.allocated(),
|
|
},
|
|
|
|
PoolMediumCap: m.poolMediumQuota(),
|
|
}
|
|
|
|
if memState.LastRisk.QuotaAlloc == 0 || memState.LastRisk.HeapAlloc <= memState.LastRisk.QuotaAlloc {
|
|
return nil
|
|
}
|
|
memState.Magnif = calcRatio(memState.LastRisk.HeapAlloc, memState.LastRisk.QuotaAlloc) + 100 /* 10 percent */
|
|
if p := m.lastMemState(); p != nil {
|
|
memState.Magnif = max(memState.Magnif, p.Magnif)
|
|
}
|
|
|
|
return &memState
|
|
}
|
|
|
|
// handleMemIssues returns `true` if the memory state is safe
|
|
func (m *MemArbitrator) handleMemIssues() (isSafe bool) {
|
|
if m.atMemRisk() {
|
|
gcExecuted := m.tryRuntimeGC()
|
|
if !gcExecuted {
|
|
m.refreshRuntimeMemStats()
|
|
}
|
|
|
|
if m.isMemNoRisk() {
|
|
m.updateTrackedHeapStats()
|
|
m.updateAvoidSize() // no need to refresh runtime mem stats
|
|
|
|
{ // warning
|
|
profile := m.recordDebugProfile()
|
|
m.actions.Info("Memory is safe", profile.fields[:profile.n]...)
|
|
}
|
|
m.setMemSafe()
|
|
return true
|
|
}
|
|
|
|
m.doReclaimNonBlockingTasks()
|
|
m.handleMemRisk(gcExecuted)
|
|
return false
|
|
} else if !m.isMemSafe() {
|
|
m.doReclaimNonBlockingTasks()
|
|
m.intoMemRisk()
|
|
return false
|
|
}
|
|
return true
|
|
}
|
|
|
|
func (m *MemArbitrator) innerTime() time.Time {
|
|
if m.debug.now != nil {
|
|
return m.debug.now()
|
|
}
|
|
return now()
|
|
}
|
|
|
|
func (m *MemArbitrator) handleMemRisk(gcExecuted bool) {
|
|
now := m.innerTime()
|
|
oomRisk := m.heapController.memInuse.Load() > m.limit()
|
|
dur := now.Sub(m.heapController.memRisk.lastMemStats.startTime)
|
|
if !oomRisk && dur < defHeapReclaimCheckDuration {
|
|
return
|
|
}
|
|
heapUseBPS := int64(0)
|
|
|
|
if dur > 0 {
|
|
heapFrees := m.heapController.heapTotalFree.Load() - m.heapController.memRisk.lastMemStats.heapTotalFree
|
|
heapUseBPS = int64(float64(heapFrees) / dur.Seconds())
|
|
}
|
|
if oomRisk || memHangRisk(heapUseBPS, m.minHeapFreeBPS(), now, m.heapController.memRisk.startTime.t) {
|
|
m.intoOOMRisk()
|
|
|
|
memToReclaim := m.heapController.memInuse.Load() - m.memRisk()
|
|
|
|
{ // warning
|
|
profile := m.recordDebugProfile()
|
|
profile.append(
|
|
zap.Float64("heap-use-speed(MiB/s)", float64(heapUseBPS*100/byteSizeMB)/100),
|
|
zap.Float64("required-speed(MiB/s)", float64(m.minHeapFreeBPS())/float64(byteSizeMB)),
|
|
zap.Int64("quota-to-reclaim", max(0, memToReclaim)),
|
|
)
|
|
m.actions.Warn("`OOM RISK`: try to `KILL` running root pool", profile.fields[:profile.n]...)
|
|
}
|
|
|
|
if newKillNum, reclaiming := m.killTopnEntry(memToReclaim); newKillNum != 0 {
|
|
m.heapController.memRisk.startTime.t = m.innerTime() // restart oom check
|
|
m.heapController.memRisk.startTime.nano.Store(m.heapController.memRisk.startTime.t.UnixNano())
|
|
|
|
{ // warning
|
|
profile := m.recordDebugProfile()
|
|
profile.append(
|
|
zap.Int64("pool-under-kill-num", m.underKill.num),
|
|
zap.Int("new-kill-num", newKillNum),
|
|
zap.Int64("quota-under-reclaim", reclaiming),
|
|
zap.Int64("rest-quota-to-reclaim", max(0, memToReclaim-reclaiming)),
|
|
)
|
|
m.actions.Warn("Restart runtime memory check", profile.fields[:profile.n]...)
|
|
}
|
|
} else {
|
|
underKillNum := 0
|
|
for _, entry := range m.underKill.entries {
|
|
if !entry.arbitratorMu.underKill.fail {
|
|
underKillNum++
|
|
}
|
|
}
|
|
if underKillNum == 0 {
|
|
forceKill := 0
|
|
for { // make all tasks succeed
|
|
entry := m.frontTaskEntry()
|
|
if entry == nil {
|
|
break
|
|
}
|
|
// force kill
|
|
if ctx := entry.ctx.Load(); ctx.available() {
|
|
ctx.stop(ArbitratorOOMRiskKill)
|
|
m.execMetrics.Risk.OOMKill[entry.ctx.memPriority]++
|
|
forceKill++
|
|
if m.removeTask(entry) {
|
|
entry.windUp(0, ArbitrateFail)
|
|
}
|
|
}
|
|
}
|
|
if forceKill != 0 {
|
|
profile := m.recordDebugProfile()
|
|
profile.append(
|
|
zap.Int("kill-awaiting-num", forceKill),
|
|
zap.Int64("pool-under-kill-num", m.underKill.num),
|
|
zap.Int64("quota-under-reclaim", reclaiming),
|
|
zap.Int64("rest-quota-to-reclaim", max(0, memToReclaim-reclaiming)),
|
|
)
|
|
m.actions.Warn("No more running root pool can be killed to resolve `OOM RISK`; KILL all awaiting tasks;",
|
|
profile.fields[:profile.n]...,
|
|
)
|
|
} else {
|
|
profile := m.recordDebugProfile()
|
|
profile.append(
|
|
zap.Int64("pool-under-kill-num", m.underKill.num),
|
|
zap.Int64("quota-under-reclaim", reclaiming),
|
|
zap.Int64("rest-quota-to-reclaim", max(0, memToReclaim-reclaiming)),
|
|
)
|
|
m.actions.Warn("No more running root pool or awaiting task can be terminated to resolve `OOM RISK`",
|
|
profile.fields[:profile.n]...,
|
|
)
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
{ // warning
|
|
profile := m.recordDebugProfile()
|
|
profile.append(zap.Float64("heap-use-speed(MiB/s)",
|
|
float64(heapUseBPS*100/byteSizeMB)/100),
|
|
zap.Float64("required-speed(MiB/s)", float64(m.minHeapFreeBPS())/float64(byteSizeMB)))
|
|
m.actions.Warn("Runtime memory free speed meets require, start re-check", profile.fields[:profile.n]...)
|
|
}
|
|
}
|
|
|
|
if dur >= defHeapReclaimCheckDuration {
|
|
m.heapController.memRisk.lastMemStats.heapTotalFree = m.heapController.heapTotalFree.Load()
|
|
m.heapController.memRisk.lastMemStats.startTime = m.innerTime()
|
|
}
|
|
|
|
if !gcExecuted {
|
|
m.gc()
|
|
}
|
|
}
|
|
|
|
func memHangRisk(freeSpeedBPS, minHeapFreeSpeedBPS int64, now, startTime time.Time) bool {
|
|
return freeSpeedBPS < minHeapFreeSpeedBPS || now.Sub(startTime) > defHeapReclaimCheckMaxDuration
|
|
}
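// Worked example (hypothetical numbers, using the package defaults): with a minimum free
// speed of 100 MiB/s (defMinHeapFreeBPS) and a 5s cap (defHeapReclaimCheckMaxDuration), a
// process that freed only 40 MiB over the last second (40 MiB/s < 100 MiB/s), or one that
// has stayed at mem risk for more than 5s, is considered hanging and handleMemRisk
// escalates it to the OOM-risk path.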
|
|
|
|
func (m *MemArbitrator) killTopnEntry(required int64) (newKillNum int, reclaimed int64) {
|
|
if m.underKill.num > 0 {
|
|
now := m.innerTime()
|
|
for uid, entry := range m.underKill.entries {
|
|
ctx := &entry.arbitratorMu.underKill
|
|
if ctx.fail {
|
|
continue
|
|
}
|
|
if deadline := ctx.startTime.Add(defKillCancelCheckTimeout); now.Compare(deadline) >= 0 {
|
|
m.actions.Error("Failed to `KILL` root pool due to timeout",
|
|
zap.Uint64("uid", uid),
|
|
zap.String("name", entry.pool.name),
|
|
zap.Int64("mem-to-reclaim", ctx.reclaim),
|
|
zap.String("mem-priority", entry.ctx.memPriority.String()),
|
|
zap.Time("start-time", ctx.startTime),
|
|
zap.Time("deadline", deadline),
|
|
)
|
|
ctx.fail = true
|
|
continue
|
|
}
|
|
reclaimed += ctx.reclaim
|
|
}
|
|
}
|
|
|
|
if reclaimed >= required {
|
|
return
|
|
}
|
|
|
|
for prio := minArbitrationPriority; prio < maxArbitrationPriority; prio++ {
|
|
for pos := m.entryMap.maxQuotaShardIndex - 1; pos >= m.entryMap.minQuotaShardIndexToCheck; pos-- {
|
|
for uid, entry := range m.entryMap.quotaShards[prio][pos].entries {
|
|
if entry.arbitratorMu.underKill.start || entry.notRunning() {
|
|
continue
|
|
}
|
|
|
|
if ctx := entry.ctx.Load(); ctx.available() {
|
|
memoryUsed := ctx.arbitrateHelper.HeapInuse()
|
|
|
|
if memoryUsed <= 0 {
|
|
continue
|
|
}
|
|
|
|
m.addUnderKill(entry, memoryUsed, m.innerTime())
|
|
reclaimed += memoryUsed
|
|
ctx.stop(ArbitratorOOMRiskKill)
|
|
newKillNum++
|
|
m.execMetrics.Risk.OOMKill[prio]++
|
|
|
|
{ // warning
|
|
m.actions.Warn("Start to `KILL` root pool",
|
|
zap.Uint64("uid", uid),
|
|
zap.String("name", entry.pool.name),
|
|
zap.Int64("mem-used", memoryUsed),
|
|
zap.String("mem-priority", ctx.memPriority.String()),
|
|
zap.Int64("rest-to-reclaim", max(0, required-reclaimed)))
|
|
}
|
|
if m.removeTask(entry) {
|
|
{ // warning
|
|
m.actions.Warn("Make the mem quota subscription failed",
|
|
zap.Uint64("uid", uid), zap.String("name", entry.pool.name))
|
|
}
|
|
entry.windUp(0, ArbitrateFail)
|
|
}
|
|
|
|
if reclaimed >= required {
|
|
return
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// LastRisk represents the last risk state of memory
|
|
type LastRisk struct {
|
|
HeapAlloc int64 `json:"heap"`
|
|
QuotaAlloc int64 `json:"quota"`
|
|
}
|
|
|
|
// RuntimeMemStateV1 represents the runtime memory state
|
|
type RuntimeMemStateV1 struct {
|
|
Version int64 `json:"version"`
|
|
LastRisk LastRisk `json:"last-risk"`
|
|
// magnification ratio of heap-alloc/quota
|
|
Magnif int64 `json:"magnif"`
|
|
// medium quota usage of root pools
|
|
PoolMediumCap int64 `json:"pool-medium-cap"`
|
|
|
|
// TODO: top-n profiles by digest
|
|
// topNProfiles [3][2]int64 `json:"top-n-profiles"`
|
|
}
|
|
|
|
func (m *MemArbitrator) recordMemState(s *RuntimeMemStateV1, reason string) error {
|
|
m.heapController.memStateRecorder.Lock()
|
|
defer m.heapController.memStateRecorder.Unlock()
|
|
m.heapController.memStateRecorder.lastMemState.Store(s)
|
|
|
|
if err := m.heapController.memStateRecorder.Store(s); err != nil {
|
|
m.execMetrics.Action.RecordMemState.Fail++
|
|
return err
|
|
}
|
|
m.execMetrics.Action.RecordMemState.Succ++
|
|
m.actions.Info("Record mem state",
|
|
zap.String("reason", reason),
|
|
zap.String("data", fmt.Sprintf("%+v", s)),
|
|
)
|
|
return nil
|
|
}
|
|
|
|
// GetAwaitFreeBudgets returns the concurrent budget shard by the given uid
|
|
func (m *MemArbitrator) GetAwaitFreeBudgets(uid uint64) *TrackedConcurrentBudget {
|
|
index := shardIndexByUID(uid, m.awaitFree.budget.sizeMask)
|
|
return &m.awaitFree.budget.shards[index]
|
|
}
|
|
|
|
func (m *MemArbitrator) initAwaitFreePool(allocAlignSize, shardNum int64) {
|
|
if allocAlignSize <= 0 {
|
|
allocAlignSize = defAwaitFreePoolAllocAlignSize
|
|
}
|
|
|
|
p := &ResourcePool{
|
|
name: "awaitfree-pool",
|
|
uid: 0,
|
|
limit: DefMaxLimit,
|
|
allocAlignSize: allocAlignSize,
|
|
|
|
maxUnusedBlocks: 0,
|
|
}
|
|
|
|
p.SetOutOfCapacityAction(func(s OutOfCapacityActionArgs) error {
|
|
if m.heapController.heapAlloc.Load() > m.oomRisk()-s.Request ||
|
|
m.allocated() > m.limit()-m.avoidance.size.Load()-s.Request {
|
|
m.updateBlockedAt()
|
|
m.execMetrics.AwaitFree.Fail++
|
|
return errArbitrateFailError
|
|
}
|
|
|
|
m.alloc(s.Request)
|
|
p.forceAddCap(s.Request)
|
|
m.execMetrics.AwaitFree.Succ++
|
|
|
|
return nil
|
|
})
|
|
|
|
m.awaitFree.pool = p
|
|
|
|
{
|
|
cnt := nextPow2(uint64(shardNum))
|
|
m.awaitFree.budget.shards = make([]TrackedConcurrentBudget, cnt)
|
|
m.awaitFree.budget.sizeMask = cnt - 1
|
|
for i := range m.awaitFree.budget.shards {
|
|
m.awaitFree.budget.shards[i].Pool = p
|
|
}
|
|
}
|
|
}
|
|
|
|
// ArbitratorStopReason represents the reason why the arbitrate helper will be stopped
|
|
type ArbitratorStopReason int
|
|
|
|
// ArbitratorStopReason values
|
|
const (
|
|
ArbitratorOOMRiskKill ArbitratorStopReason = iota
|
|
ArbitratorWaitAverseCancel
|
|
ArbitratorStandardCancel
|
|
ArbitratorPriorityCancel
|
|
)
|
|
|
|
// String returns the string representation of the ArbitratorStopReason
|
|
func (r ArbitratorStopReason) String() (desc string) {
|
|
switch r {
|
|
case ArbitratorOOMRiskKill:
|
|
desc = "KILL(out-of-memory)"
|
|
case ArbitratorWaitAverseCancel:
|
|
desc = "CANCEL(out-of-quota & wait-averse)"
|
|
case ArbitratorStandardCancel:
|
|
desc = "CANCEL(out-of-quota & standard-mode)"
|
|
case ArbitratorPriorityCancel:
|
|
desc = "CANCEL(out-of-quota & priority-mode)"
|
|
}
|
|
return
|
|
}
|
|
|
|
// ArbitrateHelper is an interface for the arbitrate helper
|
|
type ArbitrateHelper interface {
|
|
Stop(ArbitratorStopReason) bool // kill by arbitrator only when meeting oom risk; cancel by arbitrator;
|
|
HeapInuse() int64 // track heap usage
|
|
Finish()
|
|
}
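// Minimal implementation sketch (an assumption about how a caller could satisfy
// ArbitrateHelper; real implementations will be more involved). Uses the standard
// "context" package for cancellation.
//
//	type queryHelper struct {
//		heap   atomic.Int64
//		cancel context.CancelFunc
//	}
//
//	func (h *queryHelper) Stop(ArbitratorStopReason) bool { h.cancel(); return true }
//	func (h *queryHelper) HeapInuse() int64               { return h.heap.Load() }
//	func (h *queryHelper) Finish()                        {}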
|
|
|
|
// ArbitrationContext represents the context & properties of the root pool
|
|
type ArbitrationContext struct {
|
|
arbitrateHelper ArbitrateHelper
|
|
cancelCh <-chan struct{}
|
|
PrevMaxMem int64
|
|
memQuotaLimit int64
|
|
memPriority ArbitrationPriority
|
|
stopped atomic.Bool
|
|
waitAverse bool
|
|
preferPrivilege bool
|
|
}
|
|
|
|
func (ctx *ArbitrationContext) available() bool {
|
|
if ctx != nil && ctx.arbitrateHelper != nil && !ctx.stopped.Load() {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
func (ctx *ArbitrationContext) stop(reason ArbitratorStopReason) {
|
|
if ctx.stopped.Swap(true) {
|
|
return
|
|
}
|
|
ctx.arbitrateHelper.Stop(reason)
|
|
}
|
|
|
|
// NewArbitrationContext creates a new arbitration context
|
|
func NewArbitrationContext(
|
|
cancelCh <-chan struct{},
|
|
prevMaxMem, memQuotaLimit int64,
|
|
arbitrateHelper ArbitrateHelper,
|
|
memPriority ArbitrationPriority,
|
|
waitAverse bool,
|
|
preferPrivilege bool,
|
|
) *ArbitrationContext {
|
|
return &ArbitrationContext{
|
|
PrevMaxMem: prevMaxMem,
|
|
memQuotaLimit: memQuotaLimit,
|
|
cancelCh: cancelCh,
|
|
arbitrateHelper: arbitrateHelper,
|
|
memPriority: memPriority,
|
|
waitAverse: waitAverse,
|
|
preferPrivilege: preferPrivilege,
|
|
}
|
|
}
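// Restart sketch (illustrative; `rp`, the channel and the limits are assumptions):
//
//	ctx := NewArbitrationContext(cancelCh, prevMaxMem, quotaLimit, helper,
//		ArbitrationPriorityMedium, false /*waitAverse*/, false /*preferPrivilege*/)
//	if !rp.Restart(ctx) {
//		// the pool is stopped or not idle; the caller has to retry or give up
//	}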
|
|
|
|
// NumByPattern represents the number of tasks by 4 patterns: priority (low, medium, high) and wait-averse
|
|
type NumByPattern [maxArbitrateMode]int64
|
|
|
|
// TaskNumByPattern returns the number of tasks by pattern; the counts may overlap
|
|
func (m *MemArbitrator) TaskNumByPattern() (res NumByPattern) {
|
|
for i := minArbitrationPriority; i < maxArbitrationPriority; i++ {
|
|
res[i] = m.taskNumByPriority(i)
|
|
}
|
|
res[ArbitrationWaitAverse] = m.taskNumOfWaitAverse()
|
|
return
|
|
}
|
|
|
|
// ConsumeQuotaFromAwaitFreePool consumes quota from the awaitfree-pool by the given uid
|
|
func (m *MemArbitrator) ConsumeQuotaFromAwaitFreePool(uid uint64, req int64) bool {
|
|
return m.GetAwaitFreeBudgets(uid).ConsumeQuota(m.approxUnixTimeSec(), req) == nil
|
|
}
|
|
|
|
// ConsumeQuota consumes quota from the concurrent budget
|
|
// req > 0: alloc quota; try to pull from upstream;
|
|
// req <= 0: release quota
|
|
func (b *ConcurrentBudget) ConsumeQuota(utimeSec int64, req int64) error {
|
|
if req > 0 {
|
|
if b.getLastUsedTimeSec() != utimeSec {
|
|
b.setLastUsedTimeSec(utimeSec)
|
|
}
|
|
if b.Used.Add(req) > b.approxCapacity() {
|
|
if err := b.PullFromUpstream(); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
} else {
|
|
b.Used.Add(req)
|
|
}
|
|
return nil
|
|
}
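// Quota lifecycle sketch for the awaitfree path (the uid and sizes are hypothetical):
//
//	uid := uint64(42)
//	if m.ConsumeQuotaFromAwaitFreePool(uid, 1<<20) { // take 1 MiB of quota
//		// ... use the memory ...
//		m.ConsumeQuotaFromAwaitFreePool(uid, -(1 << 20)) // give it back (req <= 0 releases)
//	}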
|
|
|
|
// ReportHeapInuseToAwaitFreePool reports the heap inuse to the awaitfree-pool by the given uid
|
|
func (m *MemArbitrator) ReportHeapInuseToAwaitFreePool(uid uint64, req int64) {
|
|
m.GetAwaitFreeBudgets(uid).ReportHeapInuse(req)
|
|
}
|
|
|
|
// ReportHeapInuse reports the heap inuse to the concurrent budget
|
|
// req > 0: consume
|
|
// req < 0: release
|
|
func (b *TrackedConcurrentBudget) ReportHeapInuse(req int64) {
|
|
b.HeapInuse.Add(req)
|
|
}
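// Heap-tracking sketch: a component that charges quota to the awaitfree budgets can also
// report the heap it actually retains, so updateTrackedHeapStats counts it as tracked
// memory; `uid` and `objSize` are hypothetical.
//
//	m.ReportHeapInuseToAwaitFreePool(uid, objSize)        // object retained
//	defer m.ReportHeapInuseToAwaitFreePool(uid, -objSize) // object released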
|
|
|
|
func (m *MemArbitrator) stop() bool {
|
|
m.controlMu.Lock()
|
|
defer m.controlMu.Unlock()
|
|
|
|
if !m.controlMu.running.Load() {
|
|
return false
|
|
}
|
|
|
|
m.controlMu.running.Store(false)
|
|
m.wake()
|
|
|
|
<-m.controlMu.finishCh
|
|
|
|
m.runOneRound()
|
|
|
|
return true
|
|
}
|
|
|
|
// AtMemRisk checks if the memory is under risk
|
|
func (m *MemArbitrator) AtMemRisk() bool {
|
|
return m.atMemRisk()
|
|
}
|
|
|
|
// AtOOMRisk checks if the memory is under OOM risk
|
|
//
|
|
//go:norace
|
|
func (m *MemArbitrator) AtOOMRisk() bool {
|
|
return m.heapController.memRisk.oomRisk
|
|
}
|
|
|
|
func (m *MemArbitrator) atMemRisk() bool {
|
|
return m.heapController.memRisk.startTime.nano.Load() != 0
|
|
}
|
|
|
|
func (m *MemArbitrator) intoOOMRisk() {
|
|
m.heapController.memRisk.oomRisk = true
|
|
m.execMetrics.Risk.OOM++
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) oomRisk() int64 {
|
|
return m.mu.threshold.oomRisk
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) memRisk() int64 {
|
|
return m.mu.threshold.risk
|
|
}
|
|
|
|
func (m *MemArbitrator) intoMemRisk() {
|
|
now := m.innerTime()
|
|
m.heapController.memRisk.startTime.t = now
|
|
m.heapController.memRisk.startTime.nano.Store(now.UnixNano())
|
|
m.heapController.memRisk.lastMemStats.heapTotalFree = m.heapController.heapTotalFree.Load()
|
|
m.heapController.memRisk.lastMemStats.startTime = now
|
|
m.execMetrics.Risk.Mem++
|
|
|
|
{
|
|
profile := m.recordDebugProfile()
|
|
profile.append(zap.Int64("threshold", m.mu.threshold.oomRisk))
|
|
m.actions.Warn("Memory inuse reach threshold", profile.fields[:profile.n]...)
|
|
}
|
|
|
|
{ // GC
|
|
m.reclaimHeap()
|
|
}
|
|
|
|
if memState := m.calcMemRisk(); memState != nil {
|
|
if memState.Magnif > defMaxMagnif {
|
|
// There may be extreme memory leak issues. It's recommended to set soft limit manually.
|
|
m.actions.Warn("Memory pressure is abnormally high",
|
|
zap.Int64("mem-magnification-ratio(‰)", memState.Magnif),
|
|
zap.Int64("upper-limit-ratio(‰)", defMaxMagnif))
|
|
memState.Magnif = defMaxMagnif
|
|
}
|
|
{
|
|
m.avoidance.memMagnif.Lock()
|
|
|
|
m.doSetMemMagnif(memState.Magnif)
|
|
|
|
m.avoidance.memMagnif.Unlock()
|
|
}
|
|
|
|
if err := m.recordMemState(memState, "oom risk"); err != nil {
|
|
m.actions.Error("Failed to save mem-risk", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
if m.isMemNoRisk() {
|
|
m.wake()
|
|
}
|
|
}
|
|
|
|
func (m *MemArbitrator) setMemSafe() {
|
|
m.heapController.memRisk.startTime.nano.Store(0)
|
|
m.heapController.memRisk.oomRisk = false
|
|
}
|
|
|
|
//go:norace
|
|
func (m *MemArbitrator) setUnixTimeSec(s int64) {
|
|
m.UnixTimeSec = s
|
|
}
|
|
|
|
func (m *MemArbitrator) approxUnixTimeSec() int64 {
|
|
return m.UnixTimeSec
|
|
}
|