Files
tidb/pkg/util/memory/tracker.go

1360 lines
40 KiB
Go

// Copyright 2018 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package memory
import (
"bytes"
"fmt"
"runtime"
"slices"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/pingcap/tidb/pkg/metrics"
"github.com/pingcap/tidb/pkg/util/intest"
"github.com/pingcap/tidb/pkg/util/sqlkiller"
atomicutil "go.uber.org/atomic"
)
// TrackMemWhenExceeds is the threshold when memory usage needs to be tracked.
const TrackMemWhenExceeds = 104857600 // 100MB

// DefMemQuotaQuery is default memory quota for query.
const DefMemQuotaQuery = 1073741824 // 1GB

// Process global variables for memory limit.
var (
	// ServerMemoryLimitOriginText is the original textual form of the server memory limit setting.
	ServerMemoryLimitOriginText = atomicutil.NewString("0")
	// ServerMemoryLimit is the server memory limit in bytes.
	ServerMemoryLimit = atomicutil.NewUint64(0)
	// ServerMemoryLimitSessMinSize is the minimum session memory usage (bytes) for a session
	// to be considered when updating MemUsageTop1Tracker (see Consume); defaults to 128 MiB.
	ServerMemoryLimitSessMinSize = atomicutil.NewUint64(128 << 20)
	// QueryForceDisk counts queries forced to spill to disk. NOTE(review): not referenced in this
	// file — verify semantics against callers.
	QueryForceDisk = atomicutil.NewInt64(0)
	// TriggerMemoryLimitGC indicates whether a GC triggered by the memory limit is requested/ongoing.
	// NOTE(review): maintained by code outside this file.
	TriggerMemoryLimitGC = atomicutil.NewBool(false)
	// MemoryLimitGCLast records the time of the last memory-limit-triggered GC.
	MemoryLimitGCLast = atomicutil.NewTime(time.Time{})
	// MemoryLimitGCTotal counts memory-limit-triggered GCs.
	MemoryLimitGCTotal = atomicutil.NewInt64(0)
)
// Tracker is used to track the memory usage during query execution.
// It contains an optional limit and can be arranged into a tree structure
// such that the consumption tracked by a Tracker is also tracked by
// its ancestors. The main idea comes from Apache Impala:
//
// https://github.com/cloudera/Impala/blob/cdh5-trunk/be/src/runtime/mem-tracker.h
//
// By default, memory consumption is tracked via calls to "Consume()", either to
// the tracker itself or to one of its descendents. A typical sequence of calls
// for a single Tracker is:
// 1. tracker.SetLabel() / tracker.SetActionOnExceed() / tracker.AttachTo()
// 2. tracker.Consume() / tracker.ReplaceChild() / tracker.BytesConsumed()
//
// NOTE: We only protect concurrent access to "bytesConsumed" and "children",
// that is to say:
// 1. Only "BytesConsumed()", "Consume()" and "AttachTo()" are thread-safe.
// 2. Other operations of a Tracker tree is not thread-safe.
//
// We have two limits for the memory quota: soft limit and hard limit.
// If the soft limit is exceeded, we will trigger the action that alleviates the
// speed of memory growth. The soft limit is hard-coded as `0.8*hard limit`.
// The actions that could be triggered are: AggSpillDiskAction.
//
// If the hard limit is exceeded, we will trigger the action that immediately
// reduces memory usage. The hard limit is set by the system variable `tidb_mem_query_quota`.
// The actions that could be triggered are: SpillDiskAction, SortAndSpillDiskAction, rateLimitAction,
// PanicOnExceed, globalPanicOnExceed, LogOnExceed.
type Tracker struct {
	// parent is the parent tracker in the tree; loaded/stored atomically so that
	// Consume can walk towards the root without taking locks.
	parent atomic.Pointer[Tracker]
	// MemArbitrator, when non-nil, mediates budget allocation for this tracker (see Consume).
	MemArbitrator *memArbitrator
	// Killer delivers/handles kill signals for the bound session.
	Killer *sqlkiller.SQLKiller
	// bytesLimit holds the current hard/soft limit pair; replaced atomically as a whole.
	bytesLimit atomic.Pointer[bytesLimits]
	// actionMuForHardLimit/actionMuForSoftLimit guard the action chains fired on limit excess.
	actionMuForHardLimit actionMu
	actionMuForSoftLimit actionMu
	mu struct {
		// The children memory trackers. If the Tracker is the Global Tracker, like executor.GlobalDiskUsageTracker,
		// we wouldn't maintain its children in order to avoiding mutex contention.
		children map[int][]*Tracker
		sync.Mutex
	}
	label int // Label of this "Tracker".
	// following fields are used with atomic operations, so make them 64-byte aligned.
	bytesReleased int64 // Released bytes.
	maxConsumed atomicutil.Int64 // max number of bytes consumed during execution.
	SessionID atomicutil.Uint64 // SessionID indicates the sessionID the tracker is bound.
	bytesConsumed int64 // Consumed bytes.
	IsRootTrackerOfSess bool // IsRootTrackerOfSess indicates whether this tracker is bound for session
	isGlobal bool // isGlobal indicates whether this tracker is global tracker
}
// actionMu pairs an ActionOnExceed chain with the mutex that guards it.
type actionMu struct {
	// actionOnExceed is the head of the fallback chain; nil means no action bound.
	actionOnExceed ActionOnExceed
	sync.Mutex
}
// EnableGCAwareMemoryTrack is used to turn on/off the GC-aware memory track
var EnableGCAwareMemoryTrack = atomicutil.NewBool(false)

// finalizerRef is a dummy, non-zero-sized object used as a finalizer anchor in Release.
// https://golang.google.cn/pkg/runtime/#SetFinalizer
// It is not guaranteed that a finalizer will run if the size of *obj is zero bytes.
type finalizerRef struct {
	_ byte //nolint:unused
}

// softScale means the scale of the soft limit to the hard limit.
const softScale = 0.8

// bytesLimits holds limit config atomically.
type bytesLimits struct {
	bytesHardLimit int64 // bytesHardLimit <= 0 means no limit, used for actionMuForHardLimit.
	bytesSoftLimit int64 // bytesSoftLimit <= 0 means no limit, used for actionMuForSoftLimit.
}

// unlimitedBytesLimit is shared by all trackers without a limit, so InitTracker /
// SetBytesLimit avoid an allocation in that common case.
var unlimitedBytesLimit = bytesLimits{
	bytesHardLimit: -1,
	bytesSoftLimit: -1,
}

// defaultQueryQuota is the shared limit config for the default query quota
// (DefMemQuotaQuery hard limit, 80% of it as soft limit).
var defaultQueryQuota = bytesLimits{
	bytesHardLimit: DefMemQuotaQuery,
	bytesSoftLimit: DefMemQuotaQuery * 8 / 10,
}

// MemUsageTop1Tracker record the use memory top1 session's tracker for kill.
var MemUsageTop1Tracker atomic.Pointer[Tracker]

// mockDebugInject is a test-only hook invoked while switching to the big budget.
var mockDebugInject func()
// InitTracker initializes a memory tracker.
// 1. "label" is the label used in the usage string.
// 2. "bytesLimit <= 0" means no limit.
//
// For the common tracker, isGlobal is default as false
func InitTracker(t *Tracker, label int, bytesLimit int64, action ActionOnExceed) {
	t.mu.children = nil
	t.actionMuForHardLimit.actionOnExceed = action
	t.actionMuForSoftLimit.actionOnExceed = nil
	t.parent.Store(nil)
	t.label = label
	// Reuse the two preallocated limit configs for the common cases.
	switch {
	case bytesLimit <= 0:
		t.bytesLimit.Store(&unlimitedBytesLimit)
	case bytesLimit == DefMemQuotaQuery:
		t.bytesLimit.Store(&defaultQueryQuota)
	default:
		t.bytesLimit.Store(&bytesLimits{
			bytesHardLimit: bytesLimit,
			bytesSoftLimit: int64(float64(bytesLimit) * softScale),
		})
	}
	t.maxConsumed.Store(0)
	t.isGlobal = false
	t.MemArbitrator = nil
}
// NewTracker creates a memory tracker.
// 1. "label" is the label used in the usage string.
// 2. "bytesLimit <= 0" means no limit.
//
// For the common tracker, isGlobal is default as false
func NewTracker(label int, bytesLimit int64) *Tracker {
	tracker := &Tracker{label: label}
	limits := &bytesLimits{
		bytesHardLimit: bytesLimit,
		bytesSoftLimit: int64(float64(bytesLimit) * softScale),
	}
	tracker.bytesLimit.Store(limits)
	tracker.actionMuForHardLimit.actionOnExceed = &LogOnExceed{}
	tracker.isGlobal = false
	return tracker
}
// NewGlobalTracker creates a global tracker, its isGlobal is default as true
func NewGlobalTracker(label int, bytesLimit int64) *Tracker {
	tracker := &Tracker{label: label}
	limits := &bytesLimits{
		bytesHardLimit: bytesLimit,
		bytesSoftLimit: int64(float64(bytesLimit) * softScale),
	}
	tracker.bytesLimit.Store(limits)
	tracker.actionMuForHardLimit.actionOnExceed = &LogOnExceed{}
	tracker.isGlobal = true
	return tracker
}
// CheckBytesLimit check whether the bytes limit of the tracker is equal to a value.
// Only used in test.
func (t *Tracker) CheckBytesLimit(val int64) bool {
	limits := t.bytesLimit.Load()
	return limits.bytesHardLimit == val
}
// SetBytesLimit sets the bytes limit for this tracker.
// "bytesHardLimit <= 0" means no limit.
func (t *Tracker) SetBytesLimit(bytesLimit int64) {
	// Share the preallocated configs for the two common cases to avoid allocations.
	var limits *bytesLimits
	switch {
	case bytesLimit <= 0:
		limits = &unlimitedBytesLimit
	case bytesLimit == DefMemQuotaQuery:
		limits = &defaultQueryQuota
	default:
		limits = &bytesLimits{
			bytesHardLimit: bytesLimit,
			bytesSoftLimit: int64(float64(bytesLimit) * softScale),
		}
	}
	t.bytesLimit.Store(limits)
}
// GetBytesLimit gets the bytes limit for this tracker.
// "bytesHardLimit <= 0" means no limit.
func (t *Tracker) GetBytesLimit() int64 {
	limits := t.bytesLimit.Load()
	return limits.bytesHardLimit
}
// CheckExceed checks whether the consumed bytes is exceed for this tracker.
func (t *Tracker) CheckExceed() bool {
	hardLimit := t.bytesLimit.Load().bytesHardLimit
	if hardLimit <= 0 {
		// Non-positive hard limit means unlimited: never exceeded.
		return false
	}
	return atomic.LoadInt64(&t.bytesConsumed) >= hardLimit
}
// SetActionOnExceed sets the action when memory usage exceeds bytesHardLimit.
func (t *Tracker) SetActionOnExceed(a ActionOnExceed) {
	mu := &t.actionMuForHardLimit
	mu.Lock()
	mu.actionOnExceed = a
	mu.Unlock()
}
// FallbackOldAndSetNewAction sets the action when memory usage exceeds bytesHardLimit
// and set the original action as its fallback.
func (t *Tracker) FallbackOldAndSetNewAction(a ActionOnExceed) {
	mu := &t.actionMuForHardLimit
	mu.Lock()
	defer mu.Unlock()
	mu.actionOnExceed = reArrangeFallback(a, mu.actionOnExceed)
}
// FallbackOldAndSetNewActionForSoftLimit sets the action when memory usage exceeds bytesSoftLimit
// and set the original action as its fallback.
func (t *Tracker) FallbackOldAndSetNewActionForSoftLimit(a ActionOnExceed) {
	mu := &t.actionMuForSoftLimit
	mu.Lock()
	defer mu.Unlock()
	mu.actionOnExceed = reArrangeFallback(a, mu.actionOnExceed)
}
// GetFallbackForTest get the oom action used by test.
func (t *Tracker) GetFallbackForTest(ignoreFinishedAction bool) ActionOnExceed {
	mu := &t.actionMuForHardLimit
	mu.Lock()
	defer mu.Unlock()
	current := mu.actionOnExceed
	// Skip over a finished head action when the caller asked to ignore it.
	if current != nil && current.IsFinished() && ignoreFinishedAction {
		mu.actionOnExceed = current.GetFallback()
	}
	return mu.actionOnExceed
}
// UnbindActions unbinds actionForHardLimit and actionForSoftLimit.
func (t *Tracker) UnbindActions() {
	// Keep the original lock nesting: the soft-limit lock is held across the
	// hard-limit critical section, and released last.
	t.actionMuForSoftLimit.Lock()
	t.actionMuForSoftLimit.actionOnExceed = nil
	t.actionMuForHardLimit.Lock()
	// Currently this method is only called by ResetContextOfStmt, which then always calls SetActionOnExceed to set
	// actionForHardLimit.actionOnExceed properly, thus it's safe to set it nil here.
	t.actionMuForHardLimit.actionOnExceed = nil
	t.actionMuForHardLimit.Unlock()
	t.actionMuForSoftLimit.Unlock()
}
// UnbindActionFromHardLimit unbinds action from hardLimit.
func (t *Tracker) UnbindActionFromHardLimit(actionToUnbind ActionOnExceed) {
	t.actionMuForHardLimit.Lock()
	defer t.actionMuForHardLimit.Unlock()
	// Walk the singly linked fallback chain and splice out actionToUnbind.
	var previous ActionOnExceed
	node := t.actionMuForHardLimit.actionOnExceed
	for node != nil {
		if node == actionToUnbind {
			if previous == nil {
				// actionToUnbind is the head of the chain.
				t.actionMuForHardLimit.actionOnExceed = node.GetFallback()
			} else {
				previous.SetFallback(node.GetFallback())
			}
			return
		}
		previous = node
		node = node.GetFallback()
	}
}
// reArrangeFallback merge two action chains and rearrange them by priority in descending order.
func reArrangeFallback(a ActionOnExceed, b ActionOnExceed) ActionOnExceed {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	}
	// Keep the higher-priority action at the head.
	if b.GetPriority() > a.GetPriority() {
		a, b = b, a
	}
	a.SetFallback(reArrangeFallback(a.GetFallback(), b))
	return a
}
// SetLabel sets the label of a Tracker.
func (t *Tracker) SetLabel(label int) {
	// Re-attach under the new label so the parent's children map stays consistent.
	oldParent := t.getParent()
	t.Detach()
	t.label = label
	if oldParent == nil {
		return
	}
	t.AttachTo(oldParent)
}
// Label returns the label of this Tracker.
func (t *Tracker) Label() int {
	return t.label
}
// AttachTo attaches this memory tracker as a child to another Tracker. If it
// already has a parent, this function will remove it from the old parent.
// Its consumed memory usage is used to update all its ancestors.
func (t *Tracker) AttachTo(parent *Tracker) {
	if parent.isGlobal {
		t.AttachToGlobalTracker(parent)
		return
	}
	if prev := t.getParent(); prev != nil {
		prev.remove(t)
	}
	parent.mu.Lock()
	if parent.mu.children == nil {
		parent.mu.children = make(map[int][]*Tracker)
	}
	siblings := parent.mu.children[t.label]
	parent.mu.children[t.label] = append(siblings, t)
	parent.mu.Unlock()
	t.setParent(parent)
	// Propagate the already-consumed bytes to the new ancestors.
	parent.Consume(t.BytesConsumed())
}
// Detach de-attach the tracker child from its parent, then set its parent property as nil
func (t *Tracker) Detach() {
	if t == nil {
		return
	}
	t.DetachMemArbitrator()
	parent := t.getParent()
	if parent == nil {
		return
	}
	if parent.isGlobal {
		t.DetachFromGlobalTracker()
		return
	}
	if parent.IsRootTrackerOfSess && t.label != LabelForMemDB {
		// Detaching a non-MemDB child from the session root: drop both action chains
		// and clear any pending kill signal on the session.
		// NOTE(review): presumably the child being detached here is the statement-level
		// tracker, so this marks the end of a statement — confirm against callers.
		parent.actionMuForHardLimit.Lock()
		parent.actionMuForHardLimit.actionOnExceed = nil
		parent.actionMuForHardLimit.Unlock()
		parent.actionMuForSoftLimit.Lock()
		parent.actionMuForSoftLimit.actionOnExceed = nil
		parent.actionMuForSoftLimit.Unlock()
		parent.Killer.Reset()
	}
	parent.remove(t)
	t.setParent(nil) //atomic operator
}
// remove unlinks oldChild from t's children and, when it was actually linked,
// deducts its consumption from t and clears its parent pointer.
func (t *Tracker) remove(oldChild *Tracker) {
	label := oldChild.label
	detached := false
	t.mu.Lock()
	// Indexing a nil map is safe and yields an empty slice.
	siblings := t.mu.children[label]
	if idx := slices.Index(siblings, oldChild); idx >= 0 {
		siblings = slices.Delete(siblings, idx, idx+1)
		if len(siblings) == 0 {
			delete(t.mu.children, label)
		} else {
			t.mu.children[label] = siblings
		}
		detached = true
	}
	t.mu.Unlock()
	if detached {
		oldChild.setParent(nil)
		t.Consume(-oldChild.BytesConsumed())
	}
}
// ReplaceChild removes the old child specified in "oldChild" and add a new
// child specified in "newChild". old child's memory consumption will be
// removed and new child's memory consumption will be added.
func (t *Tracker) ReplaceChild(oldChild, newChild *Tracker) {
	if newChild == nil {
		// Degenerates to a plain removal.
		t.remove(oldChild)
		return
	}
	if oldChild.label != newChild.label {
		// Different labels live in different buckets of the children map,
		// so an in-place swap is impossible: remove then attach.
		t.remove(oldChild)
		newChild.AttachTo(t)
		return
	}
	// Same label: swap the slice slot in place and adjust t's consumption by
	// the difference (newChild's bytes minus oldChild's bytes).
	newConsumed := newChild.BytesConsumed()
	newChild.setParent(t)
	label := oldChild.label
	t.mu.Lock()
	if t.mu.children != nil {
		children := t.mu.children[label]
		for i, child := range children {
			if child != oldChild {
				continue
			}
			// Only subtract oldChild's bytes if it was actually found as a child.
			newConsumed -= oldChild.BytesConsumed()
			oldChild.setParent(nil)
			children[i] = newChild
			t.mu.children[label] = children
			break
		}
	}
	t.mu.Unlock()
	t.Consume(newConsumed)
}
// Consume is used to consume a memory usage. "bytes" can be a negative value,
// which means this is a memory release operation. When memory usage of a tracker
// exceeds its bytesSoftLimit/bytesHardLimit, the tracker calls its action, so does each of its ancestors.
func (t *Tracker) Consume(bs int64) {
	if bs == 0 {
		return
	}
	// Walk from this tracker up to the root, applying the delta to every ancestor.
	// Along the way remember the topmost tracker whose hard/soft limit is exceeded
	// (the later, higher tracker overwrites the earlier one) and the session root.
	var rootExceed, rootExceedForSoftLimit, sessionRootTracker *Tracker
	for tracker := t; tracker != nil; tracker = tracker.getParent() {
		if tracker.IsRootTrackerOfSess {
			sessionRootTracker = tracker
		}
		if m := tracker.MemArbitrator; m != nil {
			if bs > 0 {
				if m.useBigBudget() {
					goto useBigBudget
				}
				{ // fast path for small budget
					if m.addSmallBudget(bs) > m.budget.smallLimit {
						// Overflowed the small budget: roll the delta back and
						// upgrade to the big budget instead.
						m.addSmallBudget(-bs)
						goto initBigBudget
					}
					b := m.smallBudget()
					// Refresh the coarse last-used timestamp at most once per second.
					if t := m.approxUnixTimeSec(); b.getLastUsedTimeSec() != t {
						b.setLastUsedTimeSec(t)
					}
					if b.Used.Load() > b.approxCapacity() && b.PullFromUpstream() != nil {
						goto initBigBudget
					}
					goto endUseBudget
				}
			initBigBudget:
				m.initBigBudget()
			useBigBudget:
				// Grow the big budget once usage crosses the grow threshold.
				if m.addBigBudgetUsed(bs) > m.bigBudgetGrowThreshold() {
					m.growBigBudget()
				}
			endUseBudget: // nop
			} else if m.useBigBudget() { // delta <= 0 && use big budget
				m.addBigBudgetUsed(bs)
			} else { // delta <= 0 && use small budget
				m.addSmallBudget(bs)
			}
		}
		bytesConsumed := atomic.AddInt64(&tracker.bytesConsumed, bs)
		bytesReleased := atomic.LoadInt64(&tracker.bytesReleased)
		limits := tracker.bytesLimit.Load()
		// GC-aware tracking: bytesReleased counts memory handed back but not yet
		// collected, so it still counts against the limits.
		if bytesConsumed+bytesReleased >= limits.bytesHardLimit && limits.bytesHardLimit > 0 {
			rootExceed = tracker
		}
		if bytesConsumed+bytesReleased >= limits.bytesSoftLimit && limits.bytesSoftLimit > 0 {
			rootExceedForSoftLimit = tracker
		}
		// Maintain maxConsumed with a CAS loop, then refresh metrics for labeled trackers.
		for {
			maxNow := tracker.maxConsumed.Load()
			consumed := atomic.LoadInt64(&tracker.bytesConsumed)
			if consumed > maxNow && !tracker.maxConsumed.CompareAndSwap(maxNow, consumed) {
				// Lost the race with a concurrent Consume; retry with fresh values.
				continue
			}
			if tracker.label == LabelForGlobalAnalyzeMemory {
				// `LabelForGlobalAnalyzeMemory` represents in-use memory, which should never be negative.
				intest.Assert(consumed >= 0, fmt.Sprintf("global analyze memory usage negative: %d", consumed))
			}
			if label, ok := MetricsTypes[tracker.label]; ok {
				metrics.MemoryUsage.WithLabelValues(label[0], label[1]).Set(float64(consumed))
			}
			break
		}
	}
	// tryAction pops finished actions off the fallback chain, then fires the first live one.
	tryAction := func(mu *actionMu, tracker *Tracker) {
		mu.Lock()
		defer mu.Unlock()
		for mu.actionOnExceed != nil && mu.actionOnExceed.IsFinished() {
			mu.actionOnExceed = mu.actionOnExceed.GetFallback()
		}
		if mu.actionOnExceed != nil {
			mu.actionOnExceed.Action(tracker)
		}
	}
	if bs > 0 && !UsingGlobalMemArbitration() && sessionRootTracker != nil {
		// Update the Top1 session
		memUsage := sessionRootTracker.BytesConsumed()
		limitSessMinSize := ServerMemoryLimitSessMinSize.Load()
		if uint64(memUsage) >= limitSessMinSize {
			// CAS loop: only replace the recorded Top1 if this session consumes more.
			oldTracker := MemUsageTop1Tracker.Load()
			for oldTracker.LessThan(sessionRootTracker) {
				if MemUsageTop1Tracker.CompareAndSwap(oldTracker, sessionRootTracker) {
					break
				}
				oldTracker = MemUsageTop1Tracker.Load()
			}
		}
	}
	if bs > 0 && sessionRootTracker != nil {
		// Surface a pending kill signal before (possibly) firing OOM actions.
		err := sessionRootTracker.Killer.HandleSignal()
		if err != nil {
			panic(err)
		}
	}
	if bs > 0 && rootExceed != nil {
		tryAction(&rootExceed.actionMuForHardLimit, rootExceed)
	}
	if bs > 0 && rootExceedForSoftLimit != nil {
		tryAction(&rootExceedForSoftLimit.actionMuForSoftLimit, rootExceedForSoftLimit)
	}
}
// HandleKillSignal checks if a kill signal has been sent to the session root tracker.
// If a kill signal is detected, it panics with the error returned by the signal handler.
func (t *Tracker) HandleKillSignal() {
	// Find the session root by walking up the tree.
	var sessionRoot *Tracker
	for cur := t; cur != nil; cur = cur.getParent() {
		if cur.IsRootTrackerOfSess {
			sessionRoot = cur
		}
	}
	if sessionRoot == nil {
		return
	}
	if err := sessionRoot.Killer.HandleSignal(); err != nil {
		panic(err)
	}
}
// BufferedConsume is used to buffer memory usage and do late consume
// not thread-safe, should be called in one goroutine
func (t *Tracker) BufferedConsume(bufferedMemSize *int64, bytes int64) {
	*bufferedMemSize += bytes
	if *bufferedMemSize < TrackMemWhenExceeds {
		return
	}
	// Flush the accumulated delta once it crosses the tracking threshold.
	t.Consume(*bufferedMemSize)
	*bufferedMemSize = 0
}
// Release is used to release memory tracked, track the released memory until GC triggered if needed
// If you want your track to be GC-aware, please use Release(bytes) instead of Consume(-bytes), and pass the memory size of the real object.
// Only Analyze is integrated with Release so far.
func (t *Tracker) Release(bytes int64) {
	if bytes == 0 {
		return
	}
	// Always deduct from bytesConsumed, even when no GC-aware ancestor is found.
	defer t.Consume(-bytes)
	for tracker := t; tracker != nil; tracker = tracker.getParent() {
		if tracker.shouldRecordRelease() {
			// use fake ref instead of obj ref, otherwise obj will be reachable again and gc in next cycle
			newRef := &finalizerRef{}
			finalizer := func(tracker *Tracker) func(ref *finalizerRef) {
				return func(*finalizerRef) {
					tracker.release(bytes) // finalizer func is called async
				}
			}
			// The released bytes stay recorded in bytesReleased until the GC collects
			// newRef and the finalizer subtracts them again.
			runtime.SetFinalizer(newRef, finalizer(tracker))
			tracker.recordRelease(bytes)
			return
		}
	}
}
// BufferedRelease is used to buffer memory release and do late release
// not thread-safe, should be called in one goroutine
func (t *Tracker) BufferedRelease(bufferedMemSize *int64, bytes int64) {
	*bufferedMemSize += bytes
	if *bufferedMemSize < TrackMemWhenExceeds {
		return
	}
	// Flush the accumulated release once it crosses the tracking threshold.
	t.Release(*bufferedMemSize)
	*bufferedMemSize = 0
}
// shouldRecordRelease reports whether this tracker participates in GC-aware
// release accounting (globally enabled and labeled as global analyze memory).
func (t *Tracker) shouldRecordRelease() bool {
	if !EnableGCAwareMemoryTrack.Load() {
		return false
	}
	return t.label == LabelForGlobalAnalyzeMemory
}
// recordRelease adds "bytes" to bytesReleased on this tracker and every ancestor,
// refreshing the "released" metric for labeled trackers.
func (t *Tracker) recordRelease(bytes int64) {
	for cur := t; cur != nil; cur = cur.getParent() {
		released := atomic.AddInt64(&cur.bytesReleased, bytes)
		if label, ok := MetricsTypes[cur.label]; ok {
			metrics.MemoryUsage.WithLabelValues(label[0], label[2]).Set(float64(released))
		}
	}
}
// release undoes a previous recordRelease once GC has reclaimed the tracked
// object: it subtracts "bytes" from bytesReleased on this tracker and every
// ancestor, refreshing the "released" metric along the way.
// Subtracting is exactly recordRelease with a negated delta, so delegate to it
// instead of duplicating the upward traversal (keeps the two paths consistent).
func (t *Tracker) release(bytes int64) {
	t.recordRelease(-bytes)
}
// BytesConsumed returns the consumed memory usage value in bytes.
func (t *Tracker) BytesConsumed() int64 {
	return atomic.LoadInt64(&t.bytesConsumed)
}

// BytesReleased returns the released memory value in bytes.
func (t *Tracker) BytesReleased() int64 {
	return atomic.LoadInt64(&t.bytesReleased)
}

// MaxConsumed returns max number of bytes consumed during execution.
// Note: Don't make this method return -1 for special meanings in the future. Because binary plan has used -1 to
// distinguish between "0 bytes" and "N/A". ref: binaryOpFromFlatOp()
func (t *Tracker) MaxConsumed() int64 {
	return t.maxConsumed.Load()
}

// ResetMaxConsumed should be invoked before executing a new statement in a session.
// It rebases the high-water mark to the current consumption.
func (t *Tracker) ResetMaxConsumed() {
	t.maxConsumed.Store(t.BytesConsumed())
}
// SearchTrackerWithoutLock searches the specific tracker under this tracker without lock.
func (t *Tracker) SearchTrackerWithoutLock(label int) *Tracker {
	if t.label == label {
		return t
	}
	// Only direct children are inspected; the first one with the label wins.
	if candidates := t.mu.children[label]; len(candidates) > 0 {
		return candidates[0]
	}
	return nil
}
// SearchTrackerConsumedMoreThanNBytes searches the specific tracker that consumes more than NBytes.
func (t *Tracker) SearchTrackerConsumedMoreThanNBytes(limit int64) (res []*Tracker) {
	t.mu.Lock()
	defer t.mu.Unlock()
	for _, group := range t.mu.children {
		for _, child := range group {
			if child.BytesConsumed() > limit {
				res = append(res, child)
			}
		}
	}
	return res
}
// String returns the string representation of this Tracker tree.
func (t *Tracker) String() string {
	var buf bytes.Buffer
	buf.WriteString("\n")
	t.toString("", &buf)
	return buf.String()
}
// toString renders this tracker and, recursively, its children (in ascending
// label order) into buffer, one nesting level per indent step.
func (t *Tracker) toString(indent string, buffer *bytes.Buffer) {
	fmt.Fprintf(buffer, "%s\"%d\"{\n", indent, t.label)
	if limit := t.GetBytesLimit(); limit > 0 {
		fmt.Fprintf(buffer, "%s \"quota\": %s\n", indent, t.FormatBytes(limit))
	}
	fmt.Fprintf(buffer, "%s \"consumed\": %s\n", indent, t.FormatBytes(t.BytesConsumed()))
	t.mu.Lock()
	// Sort labels for deterministic output (map iteration order is random).
	sortedLabels := make([]int, 0, len(t.mu.children))
	for label := range t.mu.children {
		sortedLabels = append(sortedLabels, label)
	}
	slices.Sort(sortedLabels)
	for _, label := range sortedLabels {
		for _, child := range t.mu.children[label] {
			child.toString(indent+" ", buffer)
		}
	}
	t.mu.Unlock()
	buffer.WriteString(indent + "}\n")
}
// FormatBytes uses to format bytes, this function will prune precision before format bytes.
// It simply forwards to the package-level FormatBytes.
func (*Tracker) FormatBytes(numBytes int64) string {
	return FormatBytes(numBytes)
}
// LessThan indicates whether t byteConsumed is less than t2 byteConsumed.
// A nil tracker sorts before everything; nothing sorts before a nil t2.
func (t *Tracker) LessThan(t2 *Tracker) bool {
	switch {
	case t == nil:
		return true
	case t2 == nil:
		return false
	default:
		return t.BytesConsumed() < t2.BytesConsumed()
	}
}
// BytesToString converts the memory consumption to a readable string.
func BytesToString(numBytes int64) string {
	// Try units from largest to smallest; use the first whose value exceeds 1.
	units := []struct {
		size int64
		name string
	}{
		{byteSizeGB, "GB"},
		{byteSizeMB, "MB"},
		{byteSizeKB, "KB"},
	}
	for _, unit := range units {
		if v := float64(numBytes) / float64(unit.size); v > 1 {
			return fmt.Sprintf("%v %s", v, unit.name)
		}
	}
	return fmt.Sprintf("%v Bytes", numBytes)
}
// FormatBytes uses to format bytes, this function will prune precision before format bytes.
func FormatBytes(numBytes int64) string {
	if numBytes <= byteSizeKB {
		return BytesToString(numBytes)
	}
	unit, unitStr := getByteUnit(numBytes)
	if unit == byteSize {
		return BytesToString(numBytes)
	}
	value := float64(numBytes) / float64(unit)
	// Pick the number of decimals: 0 for exact multiples, 2 for small values, 1 otherwise.
	precision := 1
	switch {
	case numBytes%unit == 0:
		precision = 0
	case value < 10:
		precision = 2
	}
	return fmt.Sprintf("%v %s", strconv.FormatFloat(value, 'f', precision, 64), unitStr)
}
// getByteUnit returns the largest unit strictly smaller than b, with its suffix.
func getByteUnit(b int64) (int64, string) {
	switch {
	case b > byteSizeGB:
		return byteSizeGB, "GB"
	case b > byteSizeMB:
		return byteSizeMB, "MB"
	case b > byteSizeKB:
		return byteSizeKB, "KB"
	default:
		return byteSize, "Bytes"
	}
}
// AttachToGlobalTracker attach the tracker to the global tracker
// AttachToGlobalTracker should be called at the initialization for the session executor's tracker
func (t *Tracker) AttachToGlobalTracker(globalTracker *Tracker) {
	if globalTracker == nil {
		return
	}
	if !globalTracker.isGlobal {
		panic("Attach to a non-GlobalTracker")
	}
	if prev := t.getParent(); prev != nil {
		if prev.isGlobal {
			// Global trackers keep no children list: just take back the bytes.
			prev.Consume(-t.BytesConsumed())
		} else {
			prev.remove(t)
		}
	}
	t.setParent(globalTracker)
	globalTracker.Consume(t.BytesConsumed())
}
// DetachFromGlobalTracker detach itself from its parent
// Note that only the parent of this tracker is Global Tracker could call this function
// Otherwise it should use Detach
func (t *Tracker) DetachFromGlobalTracker() {
	globalParent := t.getParent()
	if globalParent == nil {
		return
	}
	if !globalParent.isGlobal {
		panic("Detach from a non-GlobalTracker")
	}
	globalParent.Consume(-t.BytesConsumed())
	t.setParent(nil)
}
// ReplaceBytesUsed replace bytesConsume for the tracker
func (t *Tracker) ReplaceBytesUsed(bytes int64) {
	// Consume the delta so ancestors and limit checks stay consistent.
	delta := bytes - t.BytesConsumed()
	t.Consume(delta)
}
// Reset detach the tracker from the old parent and clear the old children. The label and byteLimit would not be reset.
func (t *Tracker) Reset() {
	// Order matters: detach first so zeroing the consumption does not ripple into
	// the old parent after the link is gone.
	t.Detach()
	t.ReplaceBytesUsed(0)
	t.mu.children = nil
	t.resetMemArbitrator()
}

// getParent atomically loads the parent tracker (nil when detached).
func (t *Tracker) getParent() *Tracker {
	return t.parent.Load()
}

// setParent atomically stores the parent tracker.
func (t *Tracker) setParent(parent *Tracker) {
	t.parent.Store(parent)
}
// CountAllChildrenMemUse return memory used tree for the tracker
func (t *Tracker) CountAllChildrenMemUse() map[string]int64 {
	memUse := make(map[string]int64, 1024)
	countChildMem(t, "", memUse)
	return memUse
}
// GetChildrenForTest returns children trackers
func (t *Tracker) GetChildrenForTest() []*Tracker {
	t.mu.Lock()
	defer t.mu.Unlock()
	// Keep the non-nil empty slice the original contract returns.
	result := make([]*Tracker, 0)
	for _, group := range t.mu.children {
		result = append(result, group...)
	}
	return result
}
// countChildMem accumulates BytesConsumed per "family tree" path (e.g. "[-1] <- [-27]")
// for t and, recursively, all of its children.
func countChildMem(t *Tracker, familyTreeName string, trackerMemUseMap map[string]int64) {
	name := familyTreeName
	if len(name) > 0 {
		name += " <- "
	}
	name += "[" + strconv.Itoa(t.Label()) + "]"
	trackerMemUseMap[name] += t.BytesConsumed()
	t.mu.Lock()
	defer t.mu.Unlock()
	for _, group := range t.mu.children {
		for _, child := range group {
			countChildMem(child, name, trackerMemUseMap)
		}
	}
}
// Tracker labels. All are negative; each identifies what a tracker accounts for.
// Fix: LabelForNonTransactionalDML was the only constant declared without the
// explicit `int` type — add it for consistency with its siblings.
const (
	// LabelForSQLText represents the label of the SQL Text
	LabelForSQLText int = -1
	// LabelForIndexWorker represents the label of the index worker
	LabelForIndexWorker int = -2
	// LabelForInnerList represents the label of the inner list
	LabelForInnerList int = -3
	// LabelForInnerTable represents the label of the inner table
	LabelForInnerTable int = -4
	// LabelForOuterTable represents the label of the outer table
	LabelForOuterTable int = -5
	// LabelForCoprocessor represents the label of the coprocessor
	LabelForCoprocessor int = -6
	// LabelForChunkList represents the label of the chunk list
	LabelForChunkList int = -7
	// LabelForGlobalSimpleLRUCache represents the label of the Global SimpleLRUCache
	LabelForGlobalSimpleLRUCache int = -8
	// LabelForChunkDataInDiskByRows represents the label of the chunk list in disk
	LabelForChunkDataInDiskByRows int = -9
	// LabelForRowContainer represents the label of the row container
	LabelForRowContainer int = -10
	// LabelForGlobalStorage represents the label of the Global Storage
	LabelForGlobalStorage int = -11
	// LabelForGlobalMemory represents the label of the Global Memory
	LabelForGlobalMemory int = -12
	// LabelForBuildSideResult represents the label of the BuildSideResult
	LabelForBuildSideResult int = -13
	// LabelForRowChunks represents the label of the row chunks
	LabelForRowChunks int = -14
	// LabelForStatsCache represents the label of the stats cache
	LabelForStatsCache int = -15
	// LabelForOuterList represents the label of the outer list
	LabelForOuterList int = -16
	// LabelForApplyCache represents the label of the apply cache
	LabelForApplyCache int = -17
	// LabelForSimpleTask represents the label of the simple task
	LabelForSimpleTask int = -18
	// LabelForCTEStorage represents the label of CTE storage
	LabelForCTEStorage int = -19
	// LabelForIndexJoinInnerWorker represents the label of IndexJoin InnerWorker
	LabelForIndexJoinInnerWorker int = -20
	// LabelForIndexJoinOuterWorker represents the label of IndexJoin OuterWorker
	LabelForIndexJoinOuterWorker int = -21
	// LabelForBindCache represents the label of the bind cache
	LabelForBindCache int = -22
	// LabelForNonTransactionalDML represents the label of the non-transactional DML
	LabelForNonTransactionalDML int = -23
	// LabelForAnalyzeMemory represents the label of the memory of each analyze job
	LabelForAnalyzeMemory int = -24
	// LabelForGlobalAnalyzeMemory represents the label of the global memory of all analyze jobs
	LabelForGlobalAnalyzeMemory int = -25
	// LabelForPreparedPlanCache represents the label of the prepared plan cache memory usage
	LabelForPreparedPlanCache int = -26
	// LabelForSession represents the label of a session.
	LabelForSession int = -27
	// LabelForMemDB represents the label of the MemDB
	LabelForMemDB int = -28
	// LabelForCursorFetch represents the label of the execution of cursor fetch
	LabelForCursorFetch int = -29
	// LabelForChunkDataInDiskByChunks represents the label of the chunk list in disk
	LabelForChunkDataInDiskByChunks int = -30
	// LabelForSortPartition represents the label of the sort partition
	LabelForSortPartition int = -31
	// LabelForHashTableInHashJoinV2 represents the label of the hash join v2's hash table
	LabelForHashTableInHashJoinV2 int = -32
)
// MetricsTypes is used to get label for metrics
// string[0] is LblModule, string[1] is heap-in-use type, string[2] is released type
var MetricsTypes = map[int][]string{
	LabelForGlobalAnalyzeMemory: {"analyze", "inuse", "released"},
}

// States of a memArbitrator's budget lifecycle (stored in memArbitrator.state).
const (
	memArbitratorStateSmallBudget int32 = iota // using small budget
	memArbitratorStateIntoBigBudget // initializing big budget from small budget
	memArbitratorStateBigBudget // using big budget
	memArbitratorStateDown // down
)
// memArbitrator mediates a tracker's memory accounting between a cheap shared
// "small" budget and a per-root-pool "big" budget pulled from the global arbitrator.
type memArbitrator struct {
	*MemArbitrator
	// ctx carries the arbitration context used when (re)starting the root pool.
	ctx *ArbitrationContext
	killer *sqlkiller.SQLKiller
	budget struct {
		// smallB is the shared small budget used on the fast path.
		smallB *TrackedConcurrentBudget
		mu struct {
			bigB ConcurrentBudget // bigB.Used (aka growThreshold): threshold to pull from upstream (95% * bigB.Capacity)
			bigUsed atomic.Int64 // bigUsed <= growThreshold <= bigB.Capacity
			smallUsed atomic.Int64
			_ cpuCacheLinePad
		}
		// smallLimit is the max bytes allowed on the small budget before upgrading to the big one.
		smallLimit int64
		// useBig flags (and its mutex serializes) the switch from small to big budget.
		useBig struct {
			sync.Mutex
			atomic.Bool
		}
	}
	uid uint64
	digestID uint64 // identify the digest profile of root-pool / SQL
	// reserveSize, when positive, is reserved from the big budget right after init.
	reserveSize int64
	isInternal bool
	state atomic.Int32 // states: the current state of memArbitrator
	AwaitAlloc struct {
		TotalDur atomic.Int64 // total time spent waiting for memory allocation in nanoseconds
		StartUtime int64 // start time of the last allocation attempt in nanoseconds.
		Size int64 // size of the last allocation attempt in bytes. 0 means no allocation attempt is in progress.
	}
}
// bigBudget returns the big budget guarded by budget.mu.
func (m *memArbitrator) bigBudget() *ConcurrentBudget {
	return &m.budget.mu.bigB
}

// smallBudget returns the shared small budget.
func (m *memArbitrator) smallBudget() *TrackedConcurrentBudget {
	return m.budget.smallB
}

// bigBudgetGrowThreshold reads the grow threshold (stored in bigB.Used).
func (m *memArbitrator) bigBudgetGrowThreshold() int64 {
	return m.bigBudget().Used.Load()
}

// bigBudgetCap reads the big budget's (approximate) capacity.
func (m *memArbitrator) bigBudgetCap() int64 {
	return m.bigBudget().approxCapacity()
}

// bigBudgetUsed reads the bytes consumed against the big budget.
func (m *memArbitrator) bigBudgetUsed() int64 {
	return m.budget.mu.bigUsed.Load()
}

// setBigBudgetGrowThreshold stores the grow threshold (in bigB.Used).
func (m *memArbitrator) setBigBudgetGrowThreshold(x int64) {
	m.bigBudget().Used.Store(x)
}

// doSetBigBudgetCap sets the big budget capacity; callers hold the budget lock.
func (m *memArbitrator) doSetBigBudgetCap(x int64) {
	m.bigBudget().Capacity = x
}

// addBigBudgetUsed adds d to the big-budget usage and returns the new value.
func (m *memArbitrator) addBigBudgetUsed(d int64) int64 {
	return m.budget.mu.bigUsed.Add(d)
}

// smallBudgetUsed reads the bytes consumed against the small budget.
func (m *memArbitrator) smallBudgetUsed() int64 {
	return m.budget.mu.smallUsed.Load()
}

// addSmallBudget adds d to the shared small budget's counters and to the local
// smallUsed counter, returning the new local value.
func (m *memArbitrator) addSmallBudget(d int64) int64 {
	m.smallBudget().HeapInuse.Add(d)
	m.smallBudget().Used.Add(d)
	return m.budget.mu.smallUsed.Add(d)
}

// cleanSmallBudget zeroes the local small-budget usage and hands the amount
// back to the shared small budget, returning what was cleared.
func (m *memArbitrator) cleanSmallBudget() (res int64) {
	res = m.budget.mu.smallUsed.Swap(0)
	m.smallBudget().HeapInuse.Add(-res)
	m.smallBudget().Used.Add(-res)
	return res
}

// useBigBudget reports whether the arbitrator has switched to the big budget.
func (m *memArbitrator) useBigBudget() bool {
	return m.budget.useBig.Load()
}
// MemArbitration returns the time cost of memory arbitration in nanoseconds
func (t *Tracker) MemArbitration() time.Duration {
	// Nil receiver and missing arbitrator both mean "no arbitration happened".
	if t == nil || t.MemArbitrator == nil {
		return 0
	}
	return time.Duration(t.MemArbitrator.AwaitAlloc.TotalDur.Load())
}
// WaitArbitrate returns the start time and size of the last memory allocation attempt.
func (t *Tracker) WaitArbitrate() (ts time.Time, size int64) {
	// Zero values are returned for a nil tracker or one without an arbitrator.
	if t == nil || t.MemArbitrator == nil {
		return
	}
	m := t.MemArbitrator
	return time.Unix(0, m.AwaitAlloc.StartUtime), m.AwaitAlloc.Size
}
// growBigBudget enlarges the big budget's capacity once usage has crossed
// the grow threshold. The target is roughly e (~2.718, encoded as 2783/1024)
// times the current usage, clamped to at most MaxPoolAllocUnit, at least one
// pool alignment unit, and never less than the shortfall (used-capacity).
// Time spent waiting on the pool allocation is added to AwaitAlloc.TotalDur
// and observed in the global arbitration-duration metric.
func (m *memArbitrator) growBigBudget() {
	duration := int64(0)
	{
		upper := m.bigBudget()
		upper.Lock()
		used, growThreshold, capacity := m.bigBudgetUsed(), m.bigBudgetGrowThreshold(), m.bigBudgetCap()
		if used > growThreshold {
			// expect next cap := used * 2.718
			extra := max(((used*2783)>>10)-capacity, upper.Pool.allocAlignSize)
			extra = min(extra, m.poolAllocStats.MaxPoolAllocUnit)
			extra = max(extra, used-capacity)
			// Publish the in-flight attempt so WaitArbitrate can observe it.
			m.AwaitAlloc.StartUtime = time.Now().UnixNano()
			m.AwaitAlloc.Size = extra
			if err := upper.Pool.allocate(extra); err == nil {
				capacity += extra
				m.doSetBigBudgetCap(capacity)
				// Next growth triggers at 95% of the new capacity, but never
				// below the usage we just observed.
				m.setBigBudgetGrowThreshold(max(capacity*95/100, used))
			}
			duration = time.Now().UnixNano() - m.AwaitAlloc.StartUtime
			m.AwaitAlloc.StartUtime = 0
			m.AwaitAlloc.Size = 0
		}
		upper.Unlock()
	}
	if duration > 0 {
		m.AwaitAlloc.TotalDur.Add(duration)
		metrics.GlobalMemArbitrationDuration.Observe(time.Duration(duration).Seconds())
	}
}
// initBigBudget promotes the session from the shared small budget to a
// dedicated root memory pool ("big" budget). It transfers usage already
// charged to the small budget, restarts the root pool, and pre-reserves
// capacity from (in priority order) the explicit reserve size, the
// digest-profile estimate from previous executions, or a suggested initial
// capacity for medium-sized pools. Serialized by the budget.useBig lock;
// DetachMemArbitrator takes the same lock to wait for an in-flight
// promotion.
func (m *memArbitrator) initBigBudget() {
	m.budget.useBig.Lock()
	defer m.budget.useBig.Unlock()
	// Already promoted by an earlier call.
	if m.useBigBudget() {
		return
	}
	// Carry over usage accounted in the small budget; the small-budget
	// counters are cleared once promotion completes (deferred).
	if smallUsed := m.smallBudgetUsed(); smallUsed > 0 {
		m.addBigBudgetUsed(smallUsed)
		defer m.cleanSmallBudget()
	}
	root, err := m.EmplaceRootPool(m.uid)
	if err != nil {
		panic(err)
	}
	if m.isInternal {
		globalArbitrator.metrics.pools.internalSession.Add(1)
	}
	// State must move small -> intoBig exactly here; failure of either step
	// indicates a broken invariant, not a recoverable condition.
	if !root.Restart(m.ctx) || !m.state.CompareAndSwap(memArbitratorStateSmallBudget, memArbitratorStateIntoBigBudget) {
		panic("failed to init mem pool")
	}
	{
		globalArbitrator.metrics.pools.small.Add(-1)
		globalArbitrator.metrics.pools.intoBig.Add(1)
	}
	if intest.InTest {
		if mockDebugInject != nil {
			mockDebugInject()
		}
	}
	m.bigBudget().Pool = root.entry.pool
	// Pre-reserve capacity: explicit reservation wins, then the max-memory
	// estimate recorded for this digest, then a suggested initial capacity
	// when current usage already exceeds the small-pool limit.
	if m.reserveSize > 0 {
		m.reserveBigBudget(m.reserveSize)
		metrics.GlobalMemArbitratorSubEvents.PoolInitReserve.Inc()
	} else if m.ctx.PrevMaxMem > 0 {
		metrics.GlobalMemArbitratorSubEvents.PoolInitHitDigest.Inc()
		m.reserveBigBudget(m.ctx.PrevMaxMem)
	} else if m.bigBudgetUsed() > m.poolAllocStats.SmallPoolLimit {
		if initCap := m.SuggestPoolInitCap(); initCap != 0 {
			m.reserveBigBudget(initCap)
			metrics.GlobalMemArbitratorSubEvents.PoolInitMediumQuota.Inc()
		}
	}
	if m.bigBudgetCap() == 0 {
		metrics.GlobalMemArbitratorSubEvents.PoolInitNone.Inc()
	}
	m.budget.useBig.Store(true)
	if intest.InTest {
		if mockDebugInject != nil {
			mockDebugInject()
		}
	}
	// The CAS fails if DetachMemArbitrator raced us into the Down state; in
	// that case the gauges were already settled by the detach path.
	if m.state.CompareAndSwap(memArbitratorStateIntoBigBudget, memArbitratorStateBigBudget) {
		globalArbitrator.metrics.pools.intoBig.Add(-1)
		globalArbitrator.metrics.pools.big.Add(1)
	}
}
// reserveBigBudget grows the big budget so its capacity covers newCap bytes
// plus ~5.3% headroom (1053/1000), never dropping below the current grow
// threshold, capacity, or usage. Wait time is accumulated into
// AwaitAlloc.TotalDur and observed in the global arbitration metric.
func (m *memArbitrator) reserveBigBudget(newCap int64) {
	duration := int64(0)
	{
		upper := m.bigBudget()
		upper.Lock()
		capacity := m.bigBudgetCap()
		// Amount still missing to reach the padded target; may be 0 when the
		// budget is already large enough (allocate(0) is then a no-op request).
		extra := max(newCap*1053/1000, m.bigBudgetGrowThreshold(), capacity, m.bigBudgetUsed()) - capacity
		// Publish the in-flight attempt so WaitArbitrate can observe it.
		m.AwaitAlloc.StartUtime = time.Now().UnixNano()
		m.AwaitAlloc.Size = extra
		if err := upper.Pool.allocate(extra); err == nil {
			capacity += extra
			m.doSetBigBudgetCap(capacity)
			// Next growth triggers at 95% of the new capacity.
			m.setBigBudgetGrowThreshold(capacity * 95 / 100)
		}
		duration = time.Now().UnixNano() - m.AwaitAlloc.StartUtime
		m.AwaitAlloc.StartUtime = 0
		m.AwaitAlloc.Size = 0
		upper.Unlock()
	}
	if duration > 0 {
		m.AwaitAlloc.TotalDur.Add(duration)
		metrics.GlobalMemArbitrationDuration.Observe(time.Duration(duration).Seconds())
	}
}
// resetMemArbitrator drops the tracker's reference to its mem arbitrator.
// It does not perform any cleanup — see DetachMemArbitrator for that.
func (t *Tracker) resetMemArbitrator() {
	t.MemArbitrator = nil
}
// DetachMemArbitrator detaches the mem arbitrator from the tracker and cleans
// up related resources. It returns true at most once per arbitrator — for
// the call that transitions the state machine into memArbitratorStateDown;
// calls with no arbitrator attached or after detachment return false.
func (t *Tracker) DetachMemArbitrator() bool {
	m := t.MemArbitrator
	if m == nil {
		return false
	}
	// Return any outstanding usage to the shared small budget first.
	if m.smallBudgetUsed() != 0 {
		m.cleanSmallBudget()
	}
	if m.state.Load() == memArbitratorStateDown {
		return false
	}
	// Swap to the terminal state and settle the per-state gauge we were
	// counted in.
	switch m.state.Swap(memArbitratorStateDown) {
	case memArbitratorStateSmallBudget:
		globalArbitrator.metrics.pools.small.Add(-1)
	case memArbitratorStateIntoBigBudget:
		{
			m.budget.useBig.Lock() // wait for initBigBudget to finish
			globalArbitrator.metrics.pools.intoBig.Add(-1)
			m.budget.useBig.Unlock()
		}
	case memArbitratorStateBigBudget:
		globalArbitrator.metrics.pools.big.Add(-1)
	default:
		// Lost the race against a concurrent detach.
		return false
	}
	if m.isInternal {
		globalArbitrator.metrics.pools.internal.Add(-1)
	}
	killed := false
	if m.killer != nil {
		killed = m.killer.Signal != 0
	}
	maxConsumed := t.maxConsumed.Load()
	// Record the peak consumption for this digest only when the query was not
	// killed — presumably to keep the cached estimate unskewed by aborts.
	if !killed {
		m.UpdateDigestProfileCache(m.digestID, maxConsumed, m.approxUnixTimeSec())
	}
	// Big-budget sessions additionally stop the budget and release the root
	// pool registered under this session's uid.
	if m.useBigBudget() {
		m.bigBudget().Stop()
		m.ResetRootPoolByID(m.uid, maxConsumed, !killed)
	}
	return true
}
// InitMemArbitratorForTest is a simplified version of InitMemArbitrator for
// test usage: no quota, no killer, no digest, medium priority, not
// wait-averse, no explicit reservation, not internal.
func (t *Tracker) InitMemArbitratorForTest() bool {
	return t.InitMemArbitrator(GlobalMemArbitrator(), 0, nil, "", ArbitrationPriorityMedium, false, 0, false)
}
// InitMemArbitrator attaches (not thread-safe) to the mem arbitrator and initializes the context.
// It returns false when g or t is nil or an arbitrator is already attached.
// "m" is the mem-arbitrator.
// "memQuotaQuery" is the maximum memory quota for query.
// "killer" is the sql killer.
// "digestKey" is the digest key.
// "memPriority" is the memory priority for arbitration.
// "waitAverse" represents the wait averse property.
// "explicitReserveSize" is the explicit mem quota size to be reserved.
// "isInternal" indicates whether the tracker is for internal session.
func (t *Tracker) InitMemArbitrator(
	g *MemArbitrator,
	memQuotaQuery int64,
	killer *sqlkiller.SQLKiller,
	digestKey string,
	memPriority ArbitrationPriority,
	waitAverse bool,
	explicitReserveSize int64,
	isInternal bool,
) bool {
	if g == nil || t == nil || t.MemArbitrator != nil {
		return false
	}
	uid := t.SessionID.Load()
	digestID := HashStr(digestKey)
	prevMaxMem := int64(0)
	// Without an explicit reservation, seed the expected footprint from the
	// digest profile cache of previous executions of the same statement.
	if explicitReserveSize == 0 && len(digestKey) > 0 {
		if maxMem, found := g.GetDigestProfileCache(digestID, g.approxUnixTimeSec()); found {
			prevMaxMem = maxMem
		}
	}
	// Arbitration is cancelled when the killer fires its kill event.
	var cancelChan <-chan struct{}
	if killer != nil {
		cancelChan = killer.GetKillEventChan()
	}
	ctx := NewArbitrationContext(
		cancelChan,
		prevMaxMem,
		memQuotaQuery,
		&trackerArbitrateHelper{
			tracker: t,
		},
		memPriority,
		waitAverse,
		true,
	)
	m := &memArbitrator{
		MemArbitrator: g,
		uid:           uid,
		killer:        killer,
		digestID:      digestID,
		reserveSize:   explicitReserveSize,
		ctx:           ctx,
		isInternal:    isInternal,
	}
	t.MemArbitrator = m
	// Every session starts in the small-budget state for gauge accounting.
	globalArbitrator.metrics.pools.small.Add(1)
	if m.isInternal {
		globalArbitrator.metrics.pools.internal.Add(1)
	}
	// Promote to a dedicated root pool immediately when a reservation was
	// requested or the digest estimate exceeds the small-pool limit;
	// otherwise stay on the shared await-free small budget.
	if explicitReserveSize > 0 || prevMaxMem > g.poolAllocStats.SmallPoolLimit {
		m.initBigBudget()
	} else {
		m.budget.smallB = g.GetAwaitFreeBudgets(uid)
		m.budget.smallLimit = g.poolAllocStats.SmallPoolLimit
	}
	return true
}
// trackerArbitrateHelper bridges a Tracker into the arbitration context
// (passed to NewArbitrationContext in InitMemArbitrator).
type trackerArbitrateHelper struct {
	tracker *Tracker
	killed  atomic.Bool // set once the session has been signalled to die (see Stop)
}
// Finish is the arbitration completion callback: it detaches the mem
// arbitrator from the tracker and keeps the internal-session gauge in sync.
func (h *trackerArbitrateHelper) Finish() {
	t := h.tracker
	// Snapshot the arbitrator before detaching: DetachMemArbitrator tolerates
	// a nil t.MemArbitrator (returns false), but dereferencing the field
	// afterwards would panic if no arbitrator was ever attached or it was
	// reset via resetMemArbitrator in the meantime.
	m := t.MemArbitrator
	t.DetachMemArbitrator()
	// NOTE(review): internalSession is incremented only in initBigBudget;
	// confirm Finish cannot run for internal sessions that never promoted,
	// otherwise this decrement would underflow the gauge.
	if m != nil && m.isInternal {
		globalArbitrator.metrics.pools.internalSession.Add(-1)
	}
}
// Stop kills the session this helper belongs to, reporting reason via the
// kill signal. Only the first caller performs the kill; subsequent calls
// return false.
func (h *trackerArbitrateHelper) Stop(reason ArbitratorStopReason) bool {
	// Cheap read first, then atomically claim the flag.
	if h.killed.Load() {
		return false
	}
	if h.killed.Swap(true) {
		return false
	}
	// Walk up to the session's root tracker and fire its killer, if any.
	cur := h.tracker
	for cur != nil {
		if cur.IsRootTrackerOfSess && cur.Killer != nil {
			cur.Killer.SendKillSignalWithKillEventReason(sqlkiller.KilledByMemArbitrator, reason.String())
			break
		}
		cur = cur.getParent()
	}
	return true
}
// HeapInuse reports the bytes currently tracked by the helper's tracker,
// used by the arbitrator as the session's in-use heap estimate.
func (h *trackerArbitrateHelper) HeapInuse() int64 {
	return h.tracker.BytesConsumed()
}