// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scheduler

import (
	"context"
	"slices"
	"sync"
	"sync/atomic"

	"github.com/pingcap/tidb/pkg/disttask/framework/proto"
	"github.com/pingcap/tidb/pkg/util/cpu"
	"github.com/pingcap/tidb/pkg/util/logutil"
	"go.uber.org/zap"
)

// taskStripes records the number of stripes reserved by a task.
type taskStripes struct {
	task    *proto.TaskBase
	stripes int
}

// SlotManager is used to manage the resource slots and stripes.
//
// Slot is the resource unit of the dist framework on each node; each slot
// represents 1 CPU core, 1/total-core of memory, 1/total-core of disk, etc.
//
// Stripe is the resource unit of the dist framework regardless of the node:
// each stripe means 1 slot on every node managed by the dist framework.
// The number of stripes equals the number of slots on each node, as we assume
// that all nodes managed by the dist framework are isomorphic.
// The stripes reserved for a task define the maximum resources the task can
// use, but the task might not use all of them. To maximize resource
// utilization, we try to schedule as many tasks as possible, depending on the
// used slots on each node and the minimum resources required by the tasks;
// in this case we don't consider task order.
//
// The dist framework tries to allocate resources by slots and stripes and
// gives the quota to subtasks, but each subtask decides how to conform to it.
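//
// For example (illustrative numbers only): with 8-core nodes, each node has 8
// slots and the framework has 8 stripes; a task that reserves 4 stripes may
// use up to 4 slots on every managed node.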
type SlotManager struct {
	// capacity is the total number of slots and stripes.
	capacity atomic.Int32

	mu sync.RWMutex
	// reservedStripes is the number of stripes reserved by each task. When we
	// reserve by the minimum resource required by a task, we still append to
	// it, so the summed value might be larger than capacity.
	// This slice is kept in task order.
	reservedStripes []taskStripes
	// task2Index maps task ID to its index in reservedStripes, for fast delete.
	task2Index map[int64]int
	// reservedSlots is the number of slots reserved by tasks on each node. The
	// execID is only used to reserve the minimum resource when starting the
	// scheduler; the subtasks may or may not be scheduled on that node.
	reservedSlots map[string]int
	// usedSlots is the number of slots taken by tasks on each node. In some
	// cases it might be larger than capacity: the current step of a
	// higher-rank task A has few subtasks, so we start to schedule a
	// lower-rank task, but the next step of A has many subtasks.
	// Once initialized, the length of usedSlots equals the number of nodes
	// managed by the dist framework.
	usedSlots atomic.Pointer[map[string]int]
}

// newSlotManager creates a new SlotManager.
func newSlotManager() *SlotManager {
	usedSlots := make(map[string]int)
	s := &SlotManager{
		task2Index:    make(map[int64]int),
		reservedSlots: make(map[string]int),
	}
	s.usedSlots.Store(&usedSlots)
	// This node might not be a managed node of the framework, but we
	// initialize capacity with the CPU count of this node; it will be updated
	// when the node manager starts.
	s.updateCapacity(cpu.GetCPUCount())
	return s
}

// update updates the used slots on each node.
// TODO: on concurrent calls, update only once.
func (sm *SlotManager) update(ctx context.Context, nodeMgr *NodeManager, taskMgr TaskManager) error {
	nodes := nodeMgr.getNodes()
	slotsOnNodes, err := taskMgr.GetUsedSlotsOnNodes(ctx)
	if err != nil {
		return err
	}
	newUsedSlots := make(map[string]int, len(nodes))
	for _, node := range nodes {
		newUsedSlots[node.ID] = slotsOnNodes[node.ID]
	}

	sm.usedSlots.Store(&newUsedSlots)
	return nil
}

// canReserve checks whether there are enough resources for a task.
// If the resource is reserved by slots, it returns the execID of the node the
// slots are reserved on; if the resource is reserved by stripes, it returns "".
// As usedSlots is updated asynchronously, it might return false even if there
// are enough resources, or return true during a resource shortage right after
// some task has scheduled subtasks.
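//
// For example (illustrative numbers only): with capacity 16, if higher-rank
// tasks have reserved 12 stripes and this task needs 8, then 12+8 > 16, so we
// fall through to the per-node check; a node with 4 used and 2 reserved slots
// can still take the task because 4+2+8 <= 16, so we return that node's execID.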
func (sm *SlotManager) canReserve(task *proto.TaskBase) (execID string, ok bool) {
	usedSlots := *sm.usedSlots.Load()
	capacity := int(sm.capacity.Load())
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	if len(usedSlots) == 0 {
		// no node managed by dist framework
		return "", false
	}

	// First try to reserve by stripes: the task fits if its concurrency plus
	// the stripes reserved by higher-rank tasks is within capacity.
	reservedForHigherRank := 0
	for _, s := range sm.reservedStripes {
		if s.task.Compare(task) >= 0 {
			break
		}
		reservedForHigherRank += s.stripes
	}
	if task.Concurrency+reservedForHigherRank <= capacity {
		return "", true
	}

	// Otherwise try to reserve by slots on a single node.
	for id, count := range usedSlots {
		if count+sm.reservedSlots[id]+task.Concurrency <= capacity {
			return id, true
		}
	}
	return "", false
}

// reserve reserves resources for a task.
// reserve and unReserve should be called in pairs with the same parameters;
// see the example sketch after unReserve below.
func (sm *SlotManager) reserve(task *proto.TaskBase, execID string) {
	taskClone := *task

	sm.mu.Lock()
	defer sm.mu.Unlock()
	// keep reservedStripes sorted by task rank and rebuild the index map.
	sm.reservedStripes = append(sm.reservedStripes, taskStripes{&taskClone, taskClone.Concurrency})
	slices.SortFunc(sm.reservedStripes, func(a, b taskStripes) int {
		return a.task.Compare(b.task)
	})
	for i, s := range sm.reservedStripes {
		sm.task2Index[s.task.ID] = i
	}

	if execID != "" {
		sm.reservedSlots[execID] += taskClone.Concurrency
	}
}

// unReserve releases the resources reserved for a task.
func (sm *SlotManager) unReserve(task *proto.TaskBase, execID string) {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	idx, ok := sm.task2Index[task.ID]
	if !ok {
		return
	}
	// remove the task's stripes and re-index the remaining ones.
	sm.reservedStripes = append(sm.reservedStripes[:idx], sm.reservedStripes[idx+1:]...)
	delete(sm.task2Index, task.ID)
	for i, s := range sm.reservedStripes {
		sm.task2Index[s.task.ID] = i
	}

	if execID != "" {
		sm.reservedSlots[execID] -= task.Concurrency
		if sm.reservedSlots[execID] == 0 {
			delete(sm.reservedSlots, execID)
		}
	}
}
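
// exampleReserveLifecycle is an illustrative sketch, not called by the real
// scheduler: it only shows how canReserve, reserve and unReserve are intended
// to be paired with the same task and execID. The function name, the schedule
// callback and the surrounding control flow are hypothetical.
func exampleReserveLifecycle(sm *SlotManager, task *proto.TaskBase, schedule func(execID string)) {
	execID, ok := sm.canReserve(task)
	if !ok {
		// not enough free stripes or slots right now; try again later.
		return
	}
	sm.reserve(task, execID)
	// unReserve must be called with the same parameters passed to reserve.
	defer sm.unReserve(task, execID)
	schedule(execID)
}
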
func (sm *SlotManager) getCapacity() int {
	return int(sm.capacity.Load())
}

// adjustEligibleNodes prefers the nodes with enough slots: we schedule
// subtasks to such nodes first, and if there is no such node, we schedule to
// all eligible nodes.
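//
// For example (illustrative numbers only): with capacity 8, used slots
// {"a": 6, "b": 2} and concurrency 4, only "b" satisfies 2+4 <= 8, so we
// return ["b"]; with concurrency 8 neither node qualifies, so we fall back to
// all eligible nodes.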
func (sm *SlotManager) adjustEligibleNodes(eligibleNodes []string, concurrency int) []string {
	usedSlots := *sm.usedSlots.Load()
	nodes := filterNodesWithEnoughSlots(usedSlots, sm.getCapacity(), eligibleNodes, concurrency)
	if len(nodes) == 0 {
		nodes = eligibleNodes
	}
	return nodes
}

// updateCapacity updates the slot capacity to the given CPU count if it has
// changed.
func (sm *SlotManager) updateCapacity(cpuCount int) {
	old := sm.capacity.Load()
	if cpuCount > 0 && cpuCount != int(old) {
		sm.capacity.Store(int32(cpuCount))
		if old == 0 {
			logutil.BgLogger().Info("initialize slot capacity", zap.Int("capacity", cpuCount))
		} else {
			logutil.BgLogger().Info("update slot capacity",
				zap.Int("old", int(old)), zap.Int("new", cpuCount))
		}
	}
}

// filterNodesWithEnoughSlots returns the eligible nodes that still have enough
// free slots to take concurrency more slots.
func filterNodesWithEnoughSlots(usedSlots map[string]int, capacity int, eligibleNodes []string, concurrency int) []string {
	nodesOfEnoughSlots := make(map[string]struct{}, len(usedSlots))
	for node, slots := range usedSlots {
		if slots+concurrency <= capacity {
			nodesOfEnoughSlots[node] = struct{}{}
		}
	}

	result := make([]string, 0, len(eligibleNodes))
	for _, node := range eligibleNodes {
		if _, ok := nodesOfEnoughSlots[node]; ok {
			result = append(result, node)
		}
	}
	return result
}