Files
tidb/pkg/util/memory/tracker.go

1360 lines
40 KiB
Go

// Copyright 2018 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package memory
import (
"bytes"
"fmt"
"runtime"
"slices"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/pingcap/tidb/pkg/metrics"
"github.com/pingcap/tidb/pkg/util/intest"
"github.com/pingcap/tidb/pkg/util/sqlkiller"
atomicutil "go.uber.org/atomic"
)
// TrackMemWhenExceeds is the threshold when memory usage needs to be tracked.
const TrackMemWhenExceeds = 104857600 // 100MB

// DefMemQuotaQuery is default memory quota for query.
const DefMemQuotaQuery = 1073741824 // 1GB

// Process global variables for memory limit.
var (
	// ServerMemoryLimitOriginText is the original textual form of the server memory limit setting.
	ServerMemoryLimitOriginText = atomicutil.NewString("0")
	// ServerMemoryLimit is the server memory limit in bytes.
	ServerMemoryLimit = atomicutil.NewUint64(0)
	// ServerMemoryLimitSessMinSize is the minimum session memory usage (bytes) for a session
	// to be considered when updating MemUsageTop1Tracker (see Consume); defaults to 128 MiB.
	ServerMemoryLimitSessMinSize = atomicutil.NewUint64(128 << 20)
	// QueryForceDisk counts queries forced to spill to disk. NOTE(review): not referenced in this
	// file — verify semantics against callers.
	QueryForceDisk = atomicutil.NewInt64(0)
	// TriggerMemoryLimitGC indicates whether a GC triggered by the memory limit is requested/ongoing.
	// NOTE(review): maintained by code outside this file.
	TriggerMemoryLimitGC = atomicutil.NewBool(false)
	// MemoryLimitGCLast records the time of the last memory-limit-triggered GC.
	MemoryLimitGCLast = atomicutil.NewTime(time.Time{})
	// MemoryLimitGCTotal counts memory-limit-triggered GCs.
	MemoryLimitGCTotal = atomicutil.NewInt64(0)
)
// Tracker is used to track the memory usage during query execution.
// It contains an optional limit and can be arranged into a tree structure
// such that the consumption tracked by a Tracker is also tracked by
// its ancestors. The main idea comes from Apache Impala:
//
// https://github.com/cloudera/Impala/blob/cdh5-trunk/be/src/runtime/mem-tracker.h
//
// By default, memory consumption is tracked via calls to "Consume()", either to
// the tracker itself or to one of its descendents. A typical sequence of calls
// for a single Tracker is:
// 1. tracker.SetLabel() / tracker.SetActionOnExceed() / tracker.AttachTo()
// 2. tracker.Consume() / tracker.ReplaceChild() / tracker.BytesConsumed()
//
// NOTE: We only protect concurrent access to "bytesConsumed" and "children",
// that is to say:
// 1. Only "BytesConsumed()", "Consume()" and "AttachTo()" are thread-safe.
// 2. Other operations of a Tracker tree is not thread-safe.
//
// We have two limits for the memory quota: soft limit and hard limit.
// If the soft limit is exceeded, we will trigger the action that alleviates the
// speed of memory growth. The soft limit is hard-coded as `0.8*hard limit`.
// The actions that could be triggered are: AggSpillDiskAction.
//
// If the hard limit is exceeded, we will trigger the action that immediately
// reduces memory usage. The hard limit is set by the system variable `tidb_mem_query_quota`.
// The actions that could be triggered are: SpillDiskAction, SortAndSpillDiskAction, rateLimitAction,
// PanicOnExceed, globalPanicOnExceed, LogOnExceed.
type Tracker struct {
	// parent is the parent tracker in the tree; loaded/stored atomically so that
	// Consume can walk towards the root without taking locks.
	parent atomic.Pointer[Tracker]
	// MemArbitrator, when non-nil, mediates budget allocation for this tracker (see Consume).
	MemArbitrator *memArbitrator
	// Killer delivers/handles kill signals for the bound session.
	Killer *sqlkiller.SQLKiller
	// bytesLimit holds the current hard/soft limit pair; replaced atomically as a whole.
	bytesLimit atomic.Pointer[bytesLimits]
	// actionMuForHardLimit/actionMuForSoftLimit guard the action chains fired on limit excess.
	actionMuForHardLimit actionMu
	actionMuForSoftLimit actionMu
	mu struct {
		// The children memory trackers. If the Tracker is the Global Tracker, like executor.GlobalDiskUsageTracker,
		// we wouldn't maintain its children in order to avoiding mutex contention.
		children map[int][]*Tracker
		sync.Mutex
	}
	label int // Label of this "Tracker".
	// following fields are used with atomic operations, so make them 64-byte aligned.
	bytesReleased int64 // Released bytes.
	maxConsumed atomicutil.Int64 // max number of bytes consumed during execution.
	SessionID atomicutil.Uint64 // SessionID indicates the sessionID the tracker is bound.
	bytesConsumed int64 // Consumed bytes.
	IsRootTrackerOfSess bool // IsRootTrackerOfSess indicates whether this tracker is bound for session
	isGlobal bool // isGlobal indicates whether this tracker is global tracker
}
// actionMu pairs an ActionOnExceed chain with the mutex that guards it.
type actionMu struct {
	// actionOnExceed is the head of the fallback chain; nil means no action bound.
	actionOnExceed ActionOnExceed
	sync.Mutex
}
// EnableGCAwareMemoryTrack is used to turn on/off the GC-aware memory track
var EnableGCAwareMemoryTrack = atomicutil.NewBool(false)

// finalizerRef is a dummy, non-zero-sized object used as a finalizer anchor in Release.
// https://golang.google.cn/pkg/runtime/#SetFinalizer
// It is not guaranteed that a finalizer will run if the size of *obj is zero bytes.
type finalizerRef struct {
	_ byte //nolint:unused
}

// softScale means the scale of the soft limit to the hard limit.
const softScale = 0.8

// bytesLimits holds limit config atomically.
type bytesLimits struct {
	bytesHardLimit int64 // bytesHardLimit <= 0 means no limit, used for actionMuForHardLimit.
	bytesSoftLimit int64 // bytesSoftLimit <= 0 means no limit, used for actionMuForSoftLimit.
}

// unlimitedBytesLimit is shared by all trackers without a limit, so InitTracker /
// SetBytesLimit avoid an allocation in that common case.
var unlimitedBytesLimit = bytesLimits{
	bytesHardLimit: -1,
	bytesSoftLimit: -1,
}

// defaultQueryQuota is the shared limit config for the default query quota
// (DefMemQuotaQuery hard limit, 80% of it as soft limit).
var defaultQueryQuota = bytesLimits{
	bytesHardLimit: DefMemQuotaQuery,
	bytesSoftLimit: DefMemQuotaQuery * 8 / 10,
}

// MemUsageTop1Tracker record the use memory top1 session's tracker for kill.
var MemUsageTop1Tracker atomic.Pointer[Tracker]

// mockDebugInject is a test-only hook invoked while switching to the big budget.
var mockDebugInject func()
// InitTracker initializes a memory tracker.
// 1. "label" is the label used in the usage string.
// 2. "bytesLimit <= 0" means no limit.
//
// For the common tracker, isGlobal is default as false
func InitTracker(t *Tracker, label int, bytesLimit int64, action ActionOnExceed) {
	t.mu.children = nil
	t.actionMuForHardLimit.actionOnExceed = action
	t.actionMuForSoftLimit.actionOnExceed = nil
	t.parent.Store(nil)
	t.label = label
	// Reuse the two preallocated limit configs for the common cases.
	switch {
	case bytesLimit <= 0:
		t.bytesLimit.Store(&unlimitedBytesLimit)
	case bytesLimit == DefMemQuotaQuery:
		t.bytesLimit.Store(&defaultQueryQuota)
	default:
		t.bytesLimit.Store(&bytesLimits{
			bytesHardLimit: bytesLimit,
			bytesSoftLimit: int64(float64(bytesLimit) * softScale),
		})
	}
	t.maxConsumed.Store(0)
	t.isGlobal = false
	t.MemArbitrator = nil
}
// NewTracker creates a memory tracker.
// 1. "label" is the label used in the usage string.
// 2. "bytesLimit <= 0" means no limit.
//
// For the common tracker, isGlobal is default as false
func NewTracker(label int, bytesLimit int64) *Tracker {
	tracker := &Tracker{label: label}
	limits := &bytesLimits{
		bytesHardLimit: bytesLimit,
		bytesSoftLimit: int64(float64(bytesLimit) * softScale),
	}
	tracker.bytesLimit.Store(limits)
	tracker.actionMuForHardLimit.actionOnExceed = &LogOnExceed{}
	tracker.isGlobal = false
	return tracker
}
// NewGlobalTracker creates a global tracker, its isGlobal is default as true
func NewGlobalTracker(label int, bytesLimit int64) *Tracker {
	tracker := &Tracker{label: label}
	limits := &bytesLimits{
		bytesHardLimit: bytesLimit,
		bytesSoftLimit: int64(float64(bytesLimit) * softScale),
	}
	tracker.bytesLimit.Store(limits)
	tracker.actionMuForHardLimit.actionOnExceed = &LogOnExceed{}
	tracker.isGlobal = true
	return tracker
}
// CheckBytesLimit check whether the bytes limit of the tracker is equal to a value.
// Only used in test.
func (t *Tracker) CheckBytesLimit(val int64) bool {
	limits := t.bytesLimit.Load()
	return limits.bytesHardLimit == val
}
// SetBytesLimit sets the bytes limit for this tracker.
// "bytesHardLimit <= 0" means no limit.
func (t *Tracker) SetBytesLimit(bytesLimit int64) {
	// Share the preallocated configs for the two common cases to avoid allocations.
	var limits *bytesLimits
	switch {
	case bytesLimit <= 0:
		limits = &unlimitedBytesLimit
	case bytesLimit == DefMemQuotaQuery:
		limits = &defaultQueryQuota
	default:
		limits = &bytesLimits{
			bytesHardLimit: bytesLimit,
			bytesSoftLimit: int64(float64(bytesLimit) * softScale),
		}
	}
	t.bytesLimit.Store(limits)
}
// GetBytesLimit gets the bytes limit for this tracker.
// "bytesHardLimit <= 0" means no limit.
func (t *Tracker) GetBytesLimit() int64 {
	limits := t.bytesLimit.Load()
	return limits.bytesHardLimit
}
// CheckExceed checks whether the consumed bytes is exceed for this tracker.
func (t *Tracker) CheckExceed() bool {
	hardLimit := t.bytesLimit.Load().bytesHardLimit
	if hardLimit <= 0 {
		// Non-positive hard limit means unlimited: never exceeded.
		return false
	}
	return atomic.LoadInt64(&t.bytesConsumed) >= hardLimit
}
// SetActionOnExceed sets the action when memory usage exceeds bytesHardLimit.
func (t *Tracker) SetActionOnExceed(a ActionOnExceed) {
	mu := &t.actionMuForHardLimit
	mu.Lock()
	mu.actionOnExceed = a
	mu.Unlock()
}
// FallbackOldAndSetNewAction sets the action when memory usage exceeds bytesHardLimit
// and set the original action as its fallback.
func (t *Tracker) FallbackOldAndSetNewAction(a ActionOnExceed) {
	mu := &t.actionMuForHardLimit
	mu.Lock()
	defer mu.Unlock()
	mu.actionOnExceed = reArrangeFallback(a, mu.actionOnExceed)
}
// FallbackOldAndSetNewActionForSoftLimit sets the action when memory usage exceeds bytesSoftLimit
// and set the original action as its fallback.
func (t *Tracker) FallbackOldAndSetNewActionForSoftLimit(a ActionOnExceed) {
	mu := &t.actionMuForSoftLimit
	mu.Lock()
	defer mu.Unlock()
	mu.actionOnExceed = reArrangeFallback(a, mu.actionOnExceed)
}
// GetFallbackForTest get the oom action used by test.
func (t *Tracker) GetFallbackForTest(ignoreFinishedAction bool) ActionOnExceed {
	mu := &t.actionMuForHardLimit
	mu.Lock()
	defer mu.Unlock()
	current := mu.actionOnExceed
	// Skip over a finished head action when the caller asked to ignore it.
	if current != nil && current.IsFinished() && ignoreFinishedAction {
		mu.actionOnExceed = current.GetFallback()
	}
	return mu.actionOnExceed
}
// UnbindActions unbinds actionForHardLimit and actionForSoftLimit.
func (t *Tracker) UnbindActions() {
	// Keep the original lock nesting: the soft-limit lock is held across the
	// hard-limit critical section, and released last.
	t.actionMuForSoftLimit.Lock()
	t.actionMuForSoftLimit.actionOnExceed = nil
	t.actionMuForHardLimit.Lock()
	// Currently this method is only called by ResetContextOfStmt, which then always calls SetActionOnExceed to set
	// actionForHardLimit.actionOnExceed properly, thus it's safe to set it nil here.
	t.actionMuForHardLimit.actionOnExceed = nil
	t.actionMuForHardLimit.Unlock()
	t.actionMuForSoftLimit.Unlock()
}
// UnbindActionFromHardLimit unbinds action from hardLimit.
func (t *Tracker) UnbindActionFromHardLimit(actionToUnbind ActionOnExceed) {
	t.actionMuForHardLimit.Lock()
	defer t.actionMuForHardLimit.Unlock()
	// Walk the singly linked fallback chain and splice out actionToUnbind.
	var previous ActionOnExceed
	node := t.actionMuForHardLimit.actionOnExceed
	for node != nil {
		if node == actionToUnbind {
			if previous == nil {
				// actionToUnbind is the head of the chain.
				t.actionMuForHardLimit.actionOnExceed = node.GetFallback()
			} else {
				previous.SetFallback(node.GetFallback())
			}
			return
		}
		previous = node
		node = node.GetFallback()
	}
}
// reArrangeFallback merge two action chains and rearrange them by priority in descending order.
func reArrangeFallback(a ActionOnExceed, b ActionOnExceed) ActionOnExceed {
	switch {
	case a == nil:
		return b
	case b == nil:
		return a
	}
	// Keep the higher-priority action at the head.
	if b.GetPriority() > a.GetPriority() {
		a, b = b, a
	}
	a.SetFallback(reArrangeFallback(a.GetFallback(), b))
	return a
}
// SetLabel sets the label of a Tracker.
func (t *Tracker) SetLabel(label int) {
	// Re-attach under the new label so the parent's children map stays consistent.
	oldParent := t.getParent()
	t.Detach()
	t.label = label
	if oldParent == nil {
		return
	}
	t.AttachTo(oldParent)
}
// Label returns the label of this Tracker.
func (t *Tracker) Label() int {
	return t.label
}
// AttachTo attaches this memory tracker as a child to another Tracker. If it
// already has a parent, this function will remove it from the old parent.
// Its consumed memory usage is used to update all its ancestors.
func (t *Tracker) AttachTo(parent *Tracker) {
	if parent.isGlobal {
		t.AttachToGlobalTracker(parent)
		return
	}
	if prev := t.getParent(); prev != nil {
		prev.remove(t)
	}
	parent.mu.Lock()
	if parent.mu.children == nil {
		parent.mu.children = make(map[int][]*Tracker)
	}
	siblings := parent.mu.children[t.label]
	parent.mu.children[t.label] = append(siblings, t)
	parent.mu.Unlock()
	t.setParent(parent)
	// Propagate the already-consumed bytes to the new ancestors.
	parent.Consume(t.BytesConsumed())
}
// Detach de-attach the tracker child from its parent, then set its parent property as nil
func (t *Tracker) Detach() {
	if t == nil {
		return
	}
	t.DetachMemArbitrator()
	parent := t.getParent()
	if parent == nil {
		return
	}
	if parent.isGlobal {
		t.DetachFromGlobalTracker()
		return
	}
	if parent.IsRootTrackerOfSess && t.label != LabelForMemDB {
		// Detaching a non-MemDB child from the session root: drop both action chains
		// and clear any pending kill signal on the session.
		// NOTE(review): presumably the child being detached here is the statement-level
		// tracker, so this marks the end of a statement — confirm against callers.
		parent.actionMuForHardLimit.Lock()
		parent.actionMuForHardLimit.actionOnExceed = nil
		parent.actionMuForHardLimit.Unlock()
		parent.actionMuForSoftLimit.Lock()
		parent.actionMuForSoftLimit.actionOnExceed = nil
		parent.actionMuForSoftLimit.Unlock()
		parent.Killer.Reset()
	}
	parent.remove(t)
	t.setParent(nil) //atomic operator
}
// remove unlinks oldChild from t's children and, when it was actually linked,
// deducts its consumption from t and clears its parent pointer.
func (t *Tracker) remove(oldChild *Tracker) {
	label := oldChild.label
	detached := false
	t.mu.Lock()
	// Indexing a nil map is safe and yields an empty slice.
	siblings := t.mu.children[label]
	if idx := slices.Index(siblings, oldChild); idx >= 0 {
		siblings = slices.Delete(siblings, idx, idx+1)
		if len(siblings) == 0 {
			delete(t.mu.children, label)
		} else {
			t.mu.children[label] = siblings
		}
		detached = true
	}
	t.mu.Unlock()
	if detached {
		oldChild.setParent(nil)
		t.Consume(-oldChild.BytesConsumed())
	}
}
// ReplaceChild removes the old child specified in "oldChild" and add a new
// child specified in "newChild". old child's memory consumption will be
// removed and new child's memory consumption will be added.
func (t *Tracker) ReplaceChild(oldChild, newChild *Tracker) {
	if newChild == nil {
		// Degenerates to a plain removal.
		t.remove(oldChild)
		return
	}
	if oldChild.label != newChild.label {
		// Different labels live in different buckets of the children map,
		// so an in-place swap is impossible: remove then attach.
		t.remove(oldChild)
		newChild.AttachTo(t)
		return
	}
	// Same label: swap the slice slot in place and adjust t's consumption by
	// the difference (newChild's bytes minus oldChild's bytes).
	newConsumed := newChild.BytesConsumed()
	newChild.setParent(t)
	label := oldChild.label
	t.mu.Lock()
	if t.mu.children != nil {
		children := t.mu.children[label]
		for i, child := range children {
			if child != oldChild {
				continue
			}
			// Only subtract oldChild's bytes if it was actually found as a child.
			newConsumed -= oldChild.BytesConsumed()
			oldChild.setParent(nil)
			children[i] = newChild
			t.mu.children[label] = children
			break
		}
	}
	t.mu.Unlock()
	t.Consume(newConsumed)
}
// Consume is used to consume a memory usage. "bytes" can be a negative value,
// which means this is a memory release operation. When memory usage of a tracker
// exceeds its bytesSoftLimit/bytesHardLimit, the tracker calls its action, so does each of its ancestors.
func (t *Tracker) Consume(bs int64) {
	if bs == 0 {
		return
	}
	// Walk from this tracker up to the root, applying the delta to every ancestor.
	// Along the way remember the topmost tracker whose hard/soft limit is exceeded
	// (the later, higher tracker overwrites the earlier one) and the session root.
	var rootExceed, rootExceedForSoftLimit, sessionRootTracker *Tracker
	for tracker := t; tracker != nil; tracker = tracker.getParent() {
		if tracker.IsRootTrackerOfSess {
			sessionRootTracker = tracker
		}
		if m := tracker.MemArbitrator; m != nil {
			if bs > 0 {
				if m.useBigBudget() {
					goto useBigBudget
				}
				{ // fast path for small budget
					if m.addSmallBudget(bs) > m.budget.smallLimit {
						// Overflowed the small budget: roll the delta back and
						// upgrade to the big budget instead.
						m.addSmallBudget(-bs)
						goto initBigBudget
					}
					b := m.smallBudget()
					// Refresh the coarse last-used timestamp at most once per second.
					if t := m.approxUnixTimeSec(); b.getLastUsedTimeSec() != t {
						b.setLastUsedTimeSec(t)
					}
					if b.Used.Load() > b.approxCapacity() && b.PullFromUpstream() != nil {
						goto initBigBudget
					}
					goto endUseBudget
				}
			initBigBudget:
				m.initBigBudget()
			useBigBudget:
				// Grow the big budget once usage crosses the grow threshold.
				if m.addBigBudgetUsed(bs) > m.bigBudgetGrowThreshold() {
					m.growBigBudget()
				}
			endUseBudget: // nop
			} else if m.useBigBudget() { // delta <= 0 && use big budget
				m.addBigBudgetUsed(bs)
			} else { // delta <= 0 && use small budget
				m.addSmallBudget(bs)
			}
		}
		bytesConsumed := atomic.AddInt64(&tracker.bytesConsumed, bs)
		bytesReleased := atomic.LoadInt64(&tracker.bytesReleased)
		limits := tracker.bytesLimit.Load()
		// GC-aware tracking: bytesReleased counts memory handed back but not yet
		// collected, so it still counts against the limits.
		if bytesConsumed+bytesReleased >= limits.bytesHardLimit && limits.bytesHardLimit > 0 {
			rootExceed = tracker
		}
		if bytesConsumed+bytesReleased >= limits.bytesSoftLimit && limits.bytesSoftLimit > 0 {
			rootExceedForSoftLimit = tracker
		}
		// Maintain maxConsumed with a CAS loop, then refresh metrics for labeled trackers.
		for {
			maxNow := tracker.maxConsumed.Load()
			consumed := atomic.LoadInt64(&tracker.bytesConsumed)
			if consumed > maxNow && !tracker.maxConsumed.CompareAndSwap(maxNow, consumed) {
				// Lost the race with a concurrent Consume; retry with fresh values.
				continue
			}
			if tracker.label == LabelForGlobalAnalyzeMemory {
				// `LabelForGlobalAnalyzeMemory` represents in-use memory, which should never be negative.
				intest.Assert(consumed >= 0, fmt.Sprintf("global analyze memory usage negative: %d", consumed))
			}
			if label, ok := MetricsTypes[tracker.label]; ok {
				metrics.MemoryUsage.WithLabelValues(label[0], label[1]).Set(float64(consumed))
			}
			break
		}
	}
	// tryAction pops finished actions off the fallback chain, then fires the first live one.
	tryAction := func(mu *actionMu, tracker *Tracker) {
		mu.Lock()
		defer mu.Unlock()
		for mu.actionOnExceed != nil && mu.actionOnExceed.IsFinished() {
			mu.actionOnExceed = mu.actionOnExceed.GetFallback()
		}
		if mu.actionOnExceed != nil {
			mu.actionOnExceed.Action(tracker)
		}
	}
	if bs > 0 && !UsingGlobalMemArbitration() && sessionRootTracker != nil {
		// Update the Top1 session
		memUsage := sessionRootTracker.BytesConsumed()
		limitSessMinSize := ServerMemoryLimitSessMinSize.Load()
		if uint64(memUsage) >= limitSessMinSize {
			// CAS loop: only replace the recorded Top1 if this session consumes more.
			oldTracker := MemUsageTop1Tracker.Load()
			for oldTracker.LessThan(sessionRootTracker) {
				if MemUsageTop1Tracker.CompareAndSwap(oldTracker, sessionRootTracker) {
					break
				}
				oldTracker = MemUsageTop1Tracker.Load()
			}
		}
	}
	if bs > 0 && sessionRootTracker != nil {
		// Surface a pending kill signal before (possibly) firing OOM actions.
		err := sessionRootTracker.Killer.HandleSignal()
		if err != nil {
			panic(err)
		}
	}
	if bs > 0 && rootExceed != nil {
		tryAction(&rootExceed.actionMuForHardLimit, rootExceed)
	}
	if bs > 0 && rootExceedForSoftLimit != nil {
		tryAction(&rootExceedForSoftLimit.actionMuForSoftLimit, rootExceedForSoftLimit)
	}
}
// HandleKillSignal checks if a kill signal has been sent to the session root tracker.
// If a kill signal is detected, it panics with the error returned by the signal handler.
func (t *Tracker) HandleKillSignal() {
	// Find the session root by walking up the tree.
	var sessionRoot *Tracker
	for cur := t; cur != nil; cur = cur.getParent() {
		if cur.IsRootTrackerOfSess {
			sessionRoot = cur
		}
	}
	if sessionRoot == nil {
		return
	}
	if err := sessionRoot.Killer.HandleSignal(); err != nil {
		panic(err)
	}
}
// BufferedConsume is used to buffer memory usage and do late consume
// not thread-safe, should be called in one goroutine
func (t *Tracker) BufferedConsume(bufferedMemSize *int64, bytes int64) {
	*bufferedMemSize += bytes
	if *bufferedMemSize < TrackMemWhenExceeds {
		return
	}
	// Flush the accumulated delta once it crosses the tracking threshold.
	t.Consume(*bufferedMemSize)
	*bufferedMemSize = 0
}
// Release is used to release memory tracked, track the released memory until GC triggered if needed
// If you want your track to be GC-aware, please use Release(bytes) instead of Consume(-bytes), and pass the memory size of the real object.
// Only Analyze is integrated with Release so far.
func (t *Tracker) Release(bytes int64) {
	if bytes == 0 {
		return
	}
	// Always deduct from bytesConsumed, even when no GC-aware ancestor is found.
	defer t.Consume(-bytes)
	for tracker := t; tracker != nil; tracker = tracker.getParent() {
		if tracker.shouldRecordRelease() {
			// use fake ref instead of obj ref, otherwise obj will be reachable again and gc in next cycle
			newRef := &finalizerRef{}
			finalizer := func(tracker *Tracker) func(ref *finalizerRef) {
				return func(*finalizerRef) {
					tracker.release(bytes) // finalizer func is called async
				}
			}
			// The released bytes stay recorded in bytesReleased until the GC collects
			// newRef and the finalizer subtracts them again.
			runtime.SetFinalizer(newRef, finalizer(tracker))
			tracker.recordRelease(bytes)
			return
		}
	}
}
// BufferedRelease is used to buffer memory release and do late release
// not thread-safe, should be called in one goroutine
func (t *Tracker) BufferedRelease(bufferedMemSize *int64, bytes int64) {
	*bufferedMemSize += bytes
	if *bufferedMemSize < TrackMemWhenExceeds {
		return
	}
	// Flush the accumulated release once it crosses the tracking threshold.
	t.Release(*bufferedMemSize)
	*bufferedMemSize = 0
}
// shouldRecordRelease reports whether this tracker participates in GC-aware
// release accounting (globally enabled and labeled as global analyze memory).
func (t *Tracker) shouldRecordRelease() bool {
	if !EnableGCAwareMemoryTrack.Load() {
		return false
	}
	return t.label == LabelForGlobalAnalyzeMemory
}
// recordRelease adds "bytes" to bytesReleased on this tracker and every ancestor,
// refreshing the "released" metric for labeled trackers.
func (t *Tracker) recordRelease(bytes int64) {
	for cur := t; cur != nil; cur = cur.getParent() {
		released := atomic.AddInt64(&cur.bytesReleased, bytes)
		if label, ok := MetricsTypes[cur.label]; ok {
			metrics.MemoryUsage.WithLabelValues(label[0], label[2]).Set(float64(released))
		}
	}
}
// release undoes a previous recordRelease once GC has reclaimed the tracked
// object: it subtracts "bytes" from bytesReleased on this tracker and every
// ancestor, refreshing the "released" metric along the way.
// Subtracting is exactly recordRelease with a negated delta, so delegate to it
// instead of duplicating the upward traversal (keeps the two paths consistent).
func (t *Tracker) release(bytes int64) {
	t.recordRelease(-bytes)
}
// BytesConsumed returns the consumed memory usage value in bytes.
func (t *Tracker) BytesConsumed() int64 {
	return atomic.LoadInt64(&t.bytesConsumed)
}

// BytesReleased returns the released memory value in bytes.
func (t *Tracker) BytesReleased() int64 {
	return atomic.LoadInt64(&t.bytesReleased)
}

// MaxConsumed returns max number of bytes consumed during execution.
// Note: Don't make this method return -1 for special meanings in the future. Because binary plan has used -1 to
// distinguish between "0 bytes" and "N/A". ref: binaryOpFromFlatOp()
func (t *Tracker) MaxConsumed() int64 {
	return t.maxConsumed.Load()
}

// ResetMaxConsumed should be invoked before executing a new statement in a session.
// It rebases the high-water mark to the current consumption.
func (t *Tracker) ResetMaxConsumed() {
	t.maxConsumed.Store(t.BytesConsumed())
}
// SearchTrackerWithoutLock searches the specific tracker under this tracker without lock.
func (t *Tracker) SearchTrackerWithoutLock(label int) *Tracker {
	if t.label == label {
		return t
	}
	// Only direct children are inspected; the first one with the label wins.
	if candidates := t.mu.children[label]; len(candidates) > 0 {
		return candidates[0]
	}
	return nil
}
// SearchTrackerConsumedMoreThanNBytes searches the specific tracker that consumes more than NBytes.
func (t *Tracker) SearchTrackerConsumedMoreThanNBytes(limit int64) (res []*Tracker) {
	t.mu.Lock()
	defer t.mu.Unlock()
	for _, group := range t.mu.children {
		for _, child := range group {
			if child.BytesConsumed() > limit {
				res = append(res, child)
			}
		}
	}
	return res
}
// String returns the string representation of this Tracker tree.
func (t *Tracker) String() string {
	var buf bytes.Buffer
	buf.WriteString("\n")
	t.toString("", &buf)
	return buf.String()
}
// toString renders this tracker and, recursively, its children (in ascending
// label order) into buffer, one nesting level per indent step.
func (t *Tracker) toString(indent string, buffer *bytes.Buffer) {
	fmt.Fprintf(buffer, "%s\"%d\"{\n", indent, t.label)
	if limit := t.GetBytesLimit(); limit > 0 {
		fmt.Fprintf(buffer, "%s \"quota\": %s\n", indent, t.FormatBytes(limit))
	}
	fmt.Fprintf(buffer, "%s \"consumed\": %s\n", indent, t.FormatBytes(t.BytesConsumed()))
	t.mu.Lock()
	// Sort labels for deterministic output (map iteration order is random).
	sortedLabels := make([]int, 0, len(t.mu.children))
	for label := range t.mu.children {
		sortedLabels = append(sortedLabels, label)
	}
	slices.Sort(sortedLabels)
	for _, label := range sortedLabels {
		for _, child := range t.mu.children[label] {
			child.toString(indent+" ", buffer)
		}
	}
	t.mu.Unlock()
	buffer.WriteString(indent + "}\n")
}
// FormatBytes uses to format bytes, this function will prune precision before format bytes.
// It simply forwards to the package-level FormatBytes.
func (*Tracker) FormatBytes(numBytes int64) string {
	return FormatBytes(numBytes)
}
// LessThan indicates whether t byteConsumed is less than t2 byteConsumed.
// A nil tracker sorts before everything; nothing sorts before a nil t2.
func (t *Tracker) LessThan(t2 *Tracker) bool {
	switch {
	case t == nil:
		return true
	case t2 == nil:
		return false
	default:
		return t.BytesConsumed() < t2.BytesConsumed()
	}
}
// BytesToString converts the memory consumption to a readable string.
func BytesToString(numBytes int64) string {
	// Try units from largest to smallest; use the first whose value exceeds 1.
	units := []struct {
		size int64
		name string
	}{
		{byteSizeGB, "GB"},
		{byteSizeMB, "MB"},
		{byteSizeKB, "KB"},
	}
	for _, unit := range units {
		if v := float64(numBytes) / float64(unit.size); v > 1 {
			return fmt.Sprintf("%v %s", v, unit.name)
		}
	}
	return fmt.Sprintf("%v Bytes", numBytes)
}
// FormatBytes uses to format bytes, this function will prune precision before format bytes.
func FormatBytes(numBytes int64) string {
	if numBytes <= byteSizeKB {
		return BytesToString(numBytes)
	}
	unit, unitStr := getByteUnit(numBytes)
	if unit == byteSize {
		return BytesToString(numBytes)
	}
	value := float64(numBytes) / float64(unit)
	// Pick the number of decimals: 0 for exact multiples, 2 for small values, 1 otherwise.
	precision := 1
	switch {
	case numBytes%unit == 0:
		precision = 0
	case value < 10:
		precision = 2
	}
	return fmt.Sprintf("%v %s", strconv.FormatFloat(value, 'f', precision, 64), unitStr)
}
// getByteUnit returns the largest unit strictly smaller than b, with its suffix.
func getByteUnit(b int64) (int64, string) {
	switch {
	case b > byteSizeGB:
		return byteSizeGB, "GB"
	case b > byteSizeMB:
		return byteSizeMB, "MB"
	case b > byteSizeKB:
		return byteSizeKB, "KB"
	default:
		return byteSize, "Bytes"
	}
}
// AttachToGlobalTracker attach the tracker to the global tracker
// AttachToGlobalTracker should be called at the initialization for the session executor's tracker
func (t *Tracker) AttachToGlobalTracker(globalTracker *Tracker) {
	if globalTracker == nil {
		return
	}
	if !globalTracker.isGlobal {
		panic("Attach to a non-GlobalTracker")
	}
	if prev := t.getParent(); prev != nil {
		if prev.isGlobal {
			// Global trackers keep no children list: just take back the bytes.
			prev.Consume(-t.BytesConsumed())
		} else {
			prev.remove(t)
		}
	}
	t.setParent(globalTracker)
	globalTracker.Consume(t.BytesConsumed())
}
// DetachFromGlobalTracker detach itself from its parent
// Note that only the parent of this tracker is Global Tracker could call this function
// Otherwise it should use Detach
func (t *Tracker) DetachFromGlobalTracker() {
	globalParent := t.getParent()
	if globalParent == nil {
		return
	}
	if !globalParent.isGlobal {
		panic("Detach from a non-GlobalTracker")
	}
	globalParent.Consume(-t.BytesConsumed())
	t.setParent(nil)
}
// ReplaceBytesUsed replace bytesConsume for the tracker
func (t *Tracker) ReplaceBytesUsed(bytes int64) {
	// Consume the delta so ancestors and limit checks stay consistent.
	delta := bytes - t.BytesConsumed()
	t.Consume(delta)
}
// Reset detach the tracker from the old parent and clear the old children. The label and byteLimit would not be reset.
func (t *Tracker) Reset() {
	// Order matters: detach first so zeroing the consumption does not ripple into
	// the old parent after the link is gone.
	t.Detach()
	t.ReplaceBytesUsed(0)
	t.mu.children = nil
	t.resetMemArbitrator()
}

// getParent atomically loads the parent tracker (nil when detached).
func (t *Tracker) getParent() *Tracker {
	return t.parent.Load()
}

// setParent atomically stores the parent tracker.
func (t *Tracker) setParent(parent *Tracker) {
	t.parent.Store(parent)
}
// CountAllChildrenMemUse return memory used tree for the tracker
func (t *Tracker) CountAllChildrenMemUse() map[string]int64 {
	memUse := make(map[string]int64, 1024)
	countChildMem(t, "", memUse)
	return memUse
}
// GetChildrenForTest returns children trackers
func (t *Tracker) GetChildrenForTest() []*Tracker {
	t.mu.Lock()
	defer t.mu.Unlock()
	// Keep the non-nil empty slice the original contract returns.
	result := make([]*Tracker, 0)
	for _, group := range t.mu.children {
		result = append(result, group...)
	}
	return result
}
// countChildMem accumulates BytesConsumed per "family tree" path (e.g. "[-1] <- [-27]")
// for t and, recursively, all of its children.
func countChildMem(t *Tracker, familyTreeName string, trackerMemUseMap map[string]int64) {
	name := familyTreeName
	if len(name) > 0 {
		name += " <- "
	}
	name += "[" + strconv.Itoa(t.Label()) + "]"
	trackerMemUseMap[name] += t.BytesConsumed()
	t.mu.Lock()
	defer t.mu.Unlock()
	for _, group := range t.mu.children {
		for _, child := range group {
			countChildMem(child, name, trackerMemUseMap)
		}
	}
}
// Tracker labels. All are negative; each identifies what a tracker accounts for.
// Fix: LabelForNonTransactionalDML was the only constant declared without the
// explicit `int` type — add it for consistency with its siblings.
const (
	// LabelForSQLText represents the label of the SQL Text
	LabelForSQLText int = -1
	// LabelForIndexWorker represents the label of the index worker
	LabelForIndexWorker int = -2
	// LabelForInnerList represents the label of the inner list
	LabelForInnerList int = -3
	// LabelForInnerTable represents the label of the inner table
	LabelForInnerTable int = -4
	// LabelForOuterTable represents the label of the outer table
	LabelForOuterTable int = -5
	// LabelForCoprocessor represents the label of the coprocessor
	LabelForCoprocessor int = -6
	// LabelForChunkList represents the label of the chunk list
	LabelForChunkList int = -7
	// LabelForGlobalSimpleLRUCache represents the label of the Global SimpleLRUCache
	LabelForGlobalSimpleLRUCache int = -8
	// LabelForChunkDataInDiskByRows represents the label of the chunk list in disk
	LabelForChunkDataInDiskByRows int = -9
	// LabelForRowContainer represents the label of the row container
	LabelForRowContainer int = -10
	// LabelForGlobalStorage represents the label of the Global Storage
	LabelForGlobalStorage int = -11
	// LabelForGlobalMemory represents the label of the Global Memory
	LabelForGlobalMemory int = -12
	// LabelForBuildSideResult represents the label of the BuildSideResult
	LabelForBuildSideResult int = -13
	// LabelForRowChunks represents the label of the row chunks
	LabelForRowChunks int = -14
	// LabelForStatsCache represents the label of the stats cache
	LabelForStatsCache int = -15
	// LabelForOuterList represents the label of the outer list
	LabelForOuterList int = -16
	// LabelForApplyCache represents the label of the apply cache
	LabelForApplyCache int = -17
	// LabelForSimpleTask represents the label of the simple task
	LabelForSimpleTask int = -18
	// LabelForCTEStorage represents the label of CTE storage
	LabelForCTEStorage int = -19
	// LabelForIndexJoinInnerWorker represents the label of IndexJoin InnerWorker
	LabelForIndexJoinInnerWorker int = -20
	// LabelForIndexJoinOuterWorker represents the label of IndexJoin OuterWorker
	LabelForIndexJoinOuterWorker int = -21
	// LabelForBindCache represents the label of the bind cache
	LabelForBindCache int = -22
	// LabelForNonTransactionalDML represents the label of the non-transactional DML
	LabelForNonTransactionalDML int = -23
	// LabelForAnalyzeMemory represents the label of the memory of each analyze job
	LabelForAnalyzeMemory int = -24
	// LabelForGlobalAnalyzeMemory represents the label of the global memory of all analyze jobs
	LabelForGlobalAnalyzeMemory int = -25
	// LabelForPreparedPlanCache represents the label of the prepared plan cache memory usage
	LabelForPreparedPlanCache int = -26
	// LabelForSession represents the label of a session.
	LabelForSession int = -27
	// LabelForMemDB represents the label of the MemDB
	LabelForMemDB int = -28
	// LabelForCursorFetch represents the label of the execution of cursor fetch
	LabelForCursorFetch int = -29
	// LabelForChunkDataInDiskByChunks represents the label of the chunk list in disk
	LabelForChunkDataInDiskByChunks int = -30
	// LabelForSortPartition represents the label of the sort partition
	LabelForSortPartition int = -31
	// LabelForHashTableInHashJoinV2 represents the label of the hash join v2's hash table
	LabelForHashTableInHashJoinV2 int = -32
)
// MetricsTypes is used to get label for metrics
// string[0] is LblModule, string[1] is heap-in-use type, string[2] is released type
var MetricsTypes = map[int][]string{
	LabelForGlobalAnalyzeMemory: {"analyze", "inuse", "released"},
}

// States of a memArbitrator's budget lifecycle (stored in memArbitrator.state).
const (
	memArbitratorStateSmallBudget int32 = iota // using small budget
	memArbitratorStateIntoBigBudget // initializing big budget from small budget
	memArbitratorStateBigBudget // using big budget
	memArbitratorStateDown // down
)
// memArbitrator mediates a tracker's memory accounting between a cheap shared
// "small" budget and a per-root-pool "big" budget pulled from the global arbitrator.
type memArbitrator struct {
	*MemArbitrator
	// ctx carries the arbitration context used when (re)starting the root pool.
	ctx *ArbitrationContext
	killer *sqlkiller.SQLKiller
	budget struct {
		// smallB is the shared small budget used on the fast path.
		smallB *TrackedConcurrentBudget
		mu struct {
			bigB ConcurrentBudget // bigB.Used (aka growThreshold): threshold to pull from upstream (95% * bigB.Capacity)
			bigUsed atomic.Int64 // bigUsed <= growThreshold <= bigB.Capacity
			smallUsed atomic.Int64
			_ cpuCacheLinePad
		}
		// smallLimit is the max bytes allowed on the small budget before upgrading to the big one.
		smallLimit int64
		// useBig flags (and its mutex serializes) the switch from small to big budget.
		useBig struct {
			sync.Mutex
			atomic.Bool
		}
	}
	uid uint64
	digestID uint64 // identify the digest profile of root-pool / SQL
	// reserveSize, when positive, is reserved from the big budget right after init.
	reserveSize int64
	isInternal bool
	state atomic.Int32 // states: the current state of memArbitrator
	AwaitAlloc struct {
		TotalDur atomic.Int64 // total time spent waiting for memory allocation in nanoseconds
		StartUtime int64 // start time of the last allocation attempt in nanoseconds.
		Size int64 // size of the last allocation attempt in bytes. 0 means no allocation attempt is in progress.
	}
}
// bigBudget returns the big budget guarded by budget.mu.
func (m *memArbitrator) bigBudget() *ConcurrentBudget {
	return &m.budget.mu.bigB
}

// smallBudget returns the shared small budget.
func (m *memArbitrator) smallBudget() *TrackedConcurrentBudget {
	return m.budget.smallB
}

// bigBudgetGrowThreshold reads the grow threshold (stored in bigB.Used).
func (m *memArbitrator) bigBudgetGrowThreshold() int64 {
	return m.bigBudget().Used.Load()
}

// bigBudgetCap reads the big budget's (approximate) capacity.
func (m *memArbitrator) bigBudgetCap() int64 {
	return m.bigBudget().approxCapacity()
}

// bigBudgetUsed reads the bytes consumed against the big budget.
func (m *memArbitrator) bigBudgetUsed() int64 {
	return m.budget.mu.bigUsed.Load()
}

// setBigBudgetGrowThreshold stores the grow threshold (in bigB.Used).
func (m *memArbitrator) setBigBudgetGrowThreshold(x int64) {
	m.bigBudget().Used.Store(x)
}

// doSetBigBudgetCap sets the big budget capacity; callers hold the budget lock.
func (m *memArbitrator) doSetBigBudgetCap(x int64) {
	m.bigBudget().Capacity = x
}

// addBigBudgetUsed adds d to the big-budget usage and returns the new value.
func (m *memArbitrator) addBigBudgetUsed(d int64) int64 {
	return m.budget.mu.bigUsed.Add(d)
}

// smallBudgetUsed reads the bytes consumed against the small budget.
func (m *memArbitrator) smallBudgetUsed() int64 {
	return m.budget.mu.smallUsed.Load()
}

// addSmallBudget adds d to the shared small budget's counters and to the local
// smallUsed counter, returning the new local value.
func (m *memArbitrator) addSmallBudget(d int64) int64 {
	m.smallBudget().HeapInuse.Add(d)
	m.smallBudget().Used.Add(d)
	return m.budget.mu.smallUsed.Add(d)
}

// cleanSmallBudget zeroes the local small-budget usage and hands the amount
// back to the shared small budget, returning what was cleared.
func (m *memArbitrator) cleanSmallBudget() (res int64) {
	res = m.budget.mu.smallUsed.Swap(0)
	m.smallBudget().HeapInuse.Add(-res)
	m.smallBudget().Used.Add(-res)
	return res
}

// useBigBudget reports whether the arbitrator has switched to the big budget.
func (m *memArbitrator) useBigBudget() bool {
	return m.budget.useBig.Load()
}
// MemArbitration returns the time cost of memory arbitration in nanoseconds
func (t *Tracker) MemArbitration() time.Duration {
	// Nil receiver and missing arbitrator both mean "no arbitration happened".
	if t == nil || t.MemArbitrator == nil {
		return 0
	}
	return time.Duration(t.MemArbitrator.AwaitAlloc.TotalDur.Load())
}
// WaitArbitrate returns the start time and size of the last memory allocation attempt.
func (t *Tracker) WaitArbitrate() (ts time.Time, size int64) {
	// Zero values are returned for a nil tracker or one without an arbitrator.
	if t == nil || t.MemArbitrator == nil {
		return
	}
	m := t.MemArbitrator
	return time.Unix(0, m.AwaitAlloc.StartUtime), m.AwaitAlloc.Size
}
// growBigBudget enlarges the big budget's capacity once usage has crossed
// the grow threshold. The target is roughly e (~2.718, encoded as 2783/1024)
// times the current usage, clamped to at most MaxPoolAllocUnit, at least one
// pool alignment unit, and never less than the shortfall (used-capacity).
// Time spent waiting on the pool allocation is added to AwaitAlloc.TotalDur
// and observed in the global arbitration-duration metric.
func (m *memArbitrator) growBigBudget() {
	duration := int64(0)
	{
		upper := m.bigBudget()
		upper.Lock()
		used, growThreshold, capacity := m.bigBudgetUsed(), m.bigBudgetGrowThreshold(), m.bigBudgetCap()
		if used > growThreshold {
			// expect next cap := used * 2.718
			extra := max(((used*2783)>>10)-capacity, upper.Pool.allocAlignSize)
			extra = min(extra, m.poolAllocStats.MaxPoolAllocUnit)
			extra = max(extra, used-capacity)
			// Publish the in-flight attempt so WaitArbitrate can observe it.
			m.AwaitAlloc.StartUtime = time.Now().UnixNano()
			m.AwaitAlloc.Size = extra
			if err := upper.Pool.allocate(extra); err == nil {
				capacity += extra
				m.doSetBigBudgetCap(capacity)
				// Next growth triggers at 95% of the new capacity, but never
				// below the usage we just observed.
				m.setBigBudgetGrowThreshold(max(capacity*95/100, used))
			}
			duration = time.Now().UnixNano() - m.AwaitAlloc.StartUtime
			m.AwaitAlloc.StartUtime = 0
			m.AwaitAlloc.Size = 0
		}
		upper.Unlock()
	}
	if duration > 0 {
		m.AwaitAlloc.TotalDur.Add(duration)
		metrics.GlobalMemArbitrationDuration.Observe(time.Duration(duration).Seconds())
	}
}
// initBigBudget promotes the session from the shared small budget to a
// dedicated root memory pool ("big" budget). It transfers usage already
// charged to the small budget, restarts the root pool, and pre-reserves
// capacity from (in priority order) the explicit reserve size, the
// digest-profile estimate from previous executions, or a suggested initial
// capacity for medium-sized pools. Serialized by the budget.useBig lock;
// DetachMemArbitrator takes the same lock to wait for an in-flight
// promotion.
func (m *memArbitrator) initBigBudget() {
	m.budget.useBig.Lock()
	defer m.budget.useBig.Unlock()
	// Already promoted by an earlier call.
	if m.useBigBudget() {
		return
	}
	// Carry over usage accounted in the small budget; the small-budget
	// counters are cleared once promotion completes (deferred).
	if smallUsed := m.smallBudgetUsed(); smallUsed > 0 {
		m.addBigBudgetUsed(smallUsed)
		defer m.cleanSmallBudget()
	}
	root, err := m.EmplaceRootPool(m.uid)
	if err != nil {
		panic(err)
	}
	if m.isInternal {
		globalArbitrator.metrics.pools.internalSession.Add(1)
	}
	// State must move small -> intoBig exactly here; failure of either step
	// indicates a broken invariant, not a recoverable condition.
	if !root.Restart(m.ctx) || !m.state.CompareAndSwap(memArbitratorStateSmallBudget, memArbitratorStateIntoBigBudget) {
		panic("failed to init mem pool")
	}
	{
		globalArbitrator.metrics.pools.small.Add(-1)
		globalArbitrator.metrics.pools.intoBig.Add(1)
	}
	if intest.InTest {
		if mockDebugInject != nil {
			mockDebugInject()
		}
	}
	m.bigBudget().Pool = root.entry.pool
	// Pre-reserve capacity: explicit reservation wins, then the max-memory
	// estimate recorded for this digest, then a suggested initial capacity
	// when current usage already exceeds the small-pool limit.
	if m.reserveSize > 0 {
		m.reserveBigBudget(m.reserveSize)
		metrics.GlobalMemArbitratorSubEvents.PoolInitReserve.Inc()
	} else if m.ctx.PrevMaxMem > 0 {
		metrics.GlobalMemArbitratorSubEvents.PoolInitHitDigest.Inc()
		m.reserveBigBudget(m.ctx.PrevMaxMem)
	} else if m.bigBudgetUsed() > m.poolAllocStats.SmallPoolLimit {
		if initCap := m.SuggestPoolInitCap(); initCap != 0 {
			m.reserveBigBudget(initCap)
			metrics.GlobalMemArbitratorSubEvents.PoolInitMediumQuota.Inc()
		}
	}
	if m.bigBudgetCap() == 0 {
		metrics.GlobalMemArbitratorSubEvents.PoolInitNone.Inc()
	}
	m.budget.useBig.Store(true)
	if intest.InTest {
		if mockDebugInject != nil {
			mockDebugInject()
		}
	}
	// The CAS fails if DetachMemArbitrator raced us into the Down state; in
	// that case the gauges were already settled by the detach path.
	if m.state.CompareAndSwap(memArbitratorStateIntoBigBudget, memArbitratorStateBigBudget) {
		globalArbitrator.metrics.pools.intoBig.Add(-1)
		globalArbitrator.metrics.pools.big.Add(1)
	}
}
// reserveBigBudget grows the big budget so its capacity covers newCap bytes
// plus ~5.3% headroom (1053/1000), never dropping below the current grow
// threshold, capacity, or usage. Wait time is accumulated into
// AwaitAlloc.TotalDur and observed in the global arbitration metric.
func (m *memArbitrator) reserveBigBudget(newCap int64) {
	duration := int64(0)
	{
		upper := m.bigBudget()
		upper.Lock()
		capacity := m.bigBudgetCap()
		// Amount still missing to reach the padded target; may be 0 when the
		// budget is already large enough (allocate(0) is then a no-op request).
		extra := max(newCap*1053/1000, m.bigBudgetGrowThreshold(), capacity, m.bigBudgetUsed()) - capacity
		// Publish the in-flight attempt so WaitArbitrate can observe it.
		m.AwaitAlloc.StartUtime = time.Now().UnixNano()
		m.AwaitAlloc.Size = extra
		if err := upper.Pool.allocate(extra); err == nil {
			capacity += extra
			m.doSetBigBudgetCap(capacity)
			// Next growth triggers at 95% of the new capacity.
			m.setBigBudgetGrowThreshold(capacity * 95 / 100)
		}
		duration = time.Now().UnixNano() - m.AwaitAlloc.StartUtime
		m.AwaitAlloc.StartUtime = 0
		m.AwaitAlloc.Size = 0
		upper.Unlock()
	}
	if duration > 0 {
		m.AwaitAlloc.TotalDur.Add(duration)
		metrics.GlobalMemArbitrationDuration.Observe(time.Duration(duration).Seconds())
	}
}
// resetMemArbitrator drops the tracker's reference to its mem arbitrator.
// It does not perform any cleanup — see DetachMemArbitrator for that.
func (t *Tracker) resetMemArbitrator() {
	t.MemArbitrator = nil
}
// DetachMemArbitrator detaches the mem arbitrator from the tracker and cleans
// up related resources. It returns true at most once per arbitrator — for
// the call that transitions the state machine into memArbitratorStateDown;
// calls with no arbitrator attached or after detachment return false.
func (t *Tracker) DetachMemArbitrator() bool {
	m := t.MemArbitrator
	if m == nil {
		return false
	}
	// Return any outstanding usage to the shared small budget first.
	if m.smallBudgetUsed() != 0 {
		m.cleanSmallBudget()
	}
	if m.state.Load() == memArbitratorStateDown {
		return false
	}
	// Swap to the terminal state and settle the per-state gauge we were
	// counted in.
	switch m.state.Swap(memArbitratorStateDown) {
	case memArbitratorStateSmallBudget:
		globalArbitrator.metrics.pools.small.Add(-1)
	case memArbitratorStateIntoBigBudget:
		{
			m.budget.useBig.Lock() // wait for initBigBudget to finish
			globalArbitrator.metrics.pools.intoBig.Add(-1)
			m.budget.useBig.Unlock()
		}
	case memArbitratorStateBigBudget:
		globalArbitrator.metrics.pools.big.Add(-1)
	default:
		// Lost the race against a concurrent detach.
		return false
	}
	if m.isInternal {
		globalArbitrator.metrics.pools.internal.Add(-1)
	}
	killed := false
	if m.killer != nil {
		killed = m.killer.Signal != 0
	}
	maxConsumed := t.maxConsumed.Load()
	// Record the peak consumption for this digest only when the query was not
	// killed — presumably to keep the cached estimate unskewed by aborts.
	if !killed {
		m.UpdateDigestProfileCache(m.digestID, maxConsumed, m.approxUnixTimeSec())
	}
	// Big-budget sessions additionally stop the budget and release the root
	// pool registered under this session's uid.
	if m.useBigBudget() {
		m.bigBudget().Stop()
		m.ResetRootPoolByID(m.uid, maxConsumed, !killed)
	}
	return true
}
// InitMemArbitratorForTest is a simplified version of InitMemArbitrator for
// test usage: no quota, no killer, no digest, medium priority, not
// wait-averse, no explicit reservation, not internal.
func (t *Tracker) InitMemArbitratorForTest() bool {
	return t.InitMemArbitrator(GlobalMemArbitrator(), 0, nil, "", ArbitrationPriorityMedium, false, 0, false)
}
// InitMemArbitrator attaches (not thread-safe) to the mem arbitrator and initializes the context.
// It returns false when g or t is nil or an arbitrator is already attached.
// "m" is the mem-arbitrator.
// "memQuotaQuery" is the maximum memory quota for query.
// "killer" is the sql killer.
// "digestKey" is the digest key.
// "memPriority" is the memory priority for arbitration.
// "waitAverse" represents the wait averse property.
// "explicitReserveSize" is the explicit mem quota size to be reserved.
// "isInternal" indicates whether the tracker is for internal session.
func (t *Tracker) InitMemArbitrator(
	g *MemArbitrator,
	memQuotaQuery int64,
	killer *sqlkiller.SQLKiller,
	digestKey string,
	memPriority ArbitrationPriority,
	waitAverse bool,
	explicitReserveSize int64,
	isInternal bool,
) bool {
	if g == nil || t == nil || t.MemArbitrator != nil {
		return false
	}
	uid := t.SessionID.Load()
	digestID := HashStr(digestKey)
	prevMaxMem := int64(0)
	// Without an explicit reservation, seed the expected footprint from the
	// digest profile cache of previous executions of the same statement.
	if explicitReserveSize == 0 && len(digestKey) > 0 {
		if maxMem, found := g.GetDigestProfileCache(digestID, g.approxUnixTimeSec()); found {
			prevMaxMem = maxMem
		}
	}
	// Arbitration is cancelled when the killer fires its kill event.
	var cancelChan <-chan struct{}
	if killer != nil {
		cancelChan = killer.GetKillEventChan()
	}
	ctx := NewArbitrationContext(
		cancelChan,
		prevMaxMem,
		memQuotaQuery,
		&trackerArbitrateHelper{
			tracker: t,
		},
		memPriority,
		waitAverse,
		true,
	)
	m := &memArbitrator{
		MemArbitrator: g,
		uid:           uid,
		killer:        killer,
		digestID:      digestID,
		reserveSize:   explicitReserveSize,
		ctx:           ctx,
		isInternal:    isInternal,
	}
	t.MemArbitrator = m
	// Every session starts in the small-budget state for gauge accounting.
	globalArbitrator.metrics.pools.small.Add(1)
	if m.isInternal {
		globalArbitrator.metrics.pools.internal.Add(1)
	}
	// Promote to a dedicated root pool immediately when a reservation was
	// requested or the digest estimate exceeds the small-pool limit;
	// otherwise stay on the shared await-free small budget.
	if explicitReserveSize > 0 || prevMaxMem > g.poolAllocStats.SmallPoolLimit {
		m.initBigBudget()
	} else {
		m.budget.smallB = g.GetAwaitFreeBudgets(uid)
		m.budget.smallLimit = g.poolAllocStats.SmallPoolLimit
	}
	return true
}
// trackerArbitrateHelper bridges a Tracker into the arbitration context
// (passed to NewArbitrationContext in InitMemArbitrator).
type trackerArbitrateHelper struct {
	tracker *Tracker
	killed  atomic.Bool // set once the session has been signalled to die (see Stop)
}
// Finish is the arbitration completion callback: it detaches the mem
// arbitrator from the tracker and keeps the internal-session gauge in sync.
func (h *trackerArbitrateHelper) Finish() {
	t := h.tracker
	// Snapshot the arbitrator before detaching: DetachMemArbitrator tolerates
	// a nil t.MemArbitrator (returns false), but dereferencing the field
	// afterwards would panic if no arbitrator was ever attached or it was
	// reset via resetMemArbitrator in the meantime.
	m := t.MemArbitrator
	t.DetachMemArbitrator()
	// NOTE(review): internalSession is incremented only in initBigBudget;
	// confirm Finish cannot run for internal sessions that never promoted,
	// otherwise this decrement would underflow the gauge.
	if m != nil && m.isInternal {
		globalArbitrator.metrics.pools.internalSession.Add(-1)
	}
}
// Stop kills the session this helper belongs to, reporting reason via the
// kill signal. Only the first caller performs the kill; subsequent calls
// return false.
func (h *trackerArbitrateHelper) Stop(reason ArbitratorStopReason) bool {
	// Cheap read first, then atomically claim the flag.
	if h.killed.Load() {
		return false
	}
	if h.killed.Swap(true) {
		return false
	}
	// Walk up to the session's root tracker and fire its killer, if any.
	cur := h.tracker
	for cur != nil {
		if cur.IsRootTrackerOfSess && cur.Killer != nil {
			cur.Killer.SendKillSignalWithKillEventReason(sqlkiller.KilledByMemArbitrator, reason.String())
			break
		}
		cur = cur.getParent()
	}
	return true
}
// HeapInuse reports the bytes currently tracked by the helper's tracker,
// used by the arbitrator as the session's in-use heap estimate.
func (h *trackerArbitrateHelper) HeapInuse() int64 {
	return h.tracker.BytesConsumed()
}