tidb/pkg/disttask/framework/scheduler/balancer.go

// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scheduler

import (
	"context"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/log"
	"github.com/pingcap/tidb/pkg/disttask/framework/proto"
	llog "github.com/pingcap/tidb/pkg/lightning/log"
	"github.com/pingcap/tidb/pkg/util/intest"
	"go.uber.org/zap"
)

var (
	// balanceCheckInterval is the interval to check whether we need to balance the subtasks.
	balanceCheckInterval = 3 * CheckTaskFinishedInterval
)

// balancer is used to balance subtasks on managed nodes.
// It handles 2 cases:
//   - managed nodes scale in/out.
//   - nodes might run subtasks at different speeds and the amount of data
//     processed by each subtask varies, so the subtasks can become unbalanced.
//
// We try to balance in task order: subtasks are scheduled to the node with
// enough slots to run them; if there is no such node, we skip balancing for
// that task and try the next one.
type balancer struct {
	Param

	logger *zap.Logger
	// a helper temporary map to record the used slots of each node during balance
	// to avoid passing it around.
	currUsedSlots map[string]int
}
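
// newBalancer creates a balancer with the given Param.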
func newBalancer(param Param) *balancer {
	logger := log.L()
	if intest.InTest {
		logger = log.L().With(zap.String("server-id", param.serverID))
	}
	return &balancer{
		Param:         param,
		logger:        logger,
		currUsedSlots: make(map[string]int),
	}
}
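
// balanceLoop periodically balances subtasks until the context is canceled.
//
// A minimal usage sketch (illustrative only; how the Manager actually wires
// this up is an assumption here):
//
//	b := newBalancer(param)
//	go b.balanceLoop(ctx, sm)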
func (b *balancer) balanceLoop(ctx context.Context, sm *Manager) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-time.After(balanceCheckInterval):
		}
		b.balance(ctx, sm)
	}
}
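
// balance resets currUsedSlots from the current managed nodes, then tries to
// balance subtasks for each scheduler in task order. It stops at the first
// task that fails to balance.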
func (b *balancer) balance(ctx context.Context, sm *Manager) {
	// we will use currUsedSlots to calculate adjusted eligible nodes during balance,
	// its initial value depends on the managed nodes, so to have a consistent view,
	// DO NOT call getNodes twice during one round of balance.
	managedNodes := b.nodeMgr.getNodes()
	b.currUsedSlots = make(map[string]int, len(managedNodes))
	for _, n := range managedNodes {
		b.currUsedSlots[n.ID] = 0
	}

	schedulers := sm.getSchedulers()
	for _, sch := range schedulers {
		nodeIDs := filterByScope(managedNodes, sch.GetTask().TargetScope)
		if err := b.balanceSubtasks(ctx, sch, nodeIDs); err != nil {
			b.logger.Warn("failed to balance subtasks",
				zap.Int64("task-id", sch.GetTask().ID), llog.ShortError(err))
			return
		}
	}
}
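
// balanceSubtasks filters the given managed nodes down to nodes eligible for
// the scheduler's task, and balances the task's subtasks among them.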
func (b *balancer) balanceSubtasks(ctx context.Context, sch Scheduler, managedNodes []string) error {
	task := sch.GetTask()
	eligibleNodes, err := getEligibleNodes(ctx, sch, managedNodes)
	if err != nil {
		return err
	}
	if len(eligibleNodes) == 0 {
		return errors.New("no eligible nodes to balance subtasks")
	}
	return b.doBalanceSubtasks(ctx, task.ID, eligibleNodes)
}
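
// doBalanceSubtasks redistributes the active subtasks of a task toward a
// target where each node with enough slots holds either averageSubtaskCnt or
// averageSubtaskCnt+1 subtasks. Between live nodes only pending subtasks are
// moved; subtasks on dead nodes or on nodes without enough slots are all
// scheduled away.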
func (b *balancer) doBalanceSubtasks(ctx context.Context, taskID int64, eligibleNodes []string) (err error) {
	subtasks, err := b.taskMgr.GetActiveSubtasks(ctx, taskID)
	if err != nil {
		return err
	}
	if len(subtasks) == 0 {
		return nil
	}

	// balance subtasks only onto nodes with enough slots, so from the view of
	// all managed nodes, the subtasks of a task might still be unbalanced.
	adjustedNodes := filterNodesWithEnoughSlots(b.currUsedSlots, b.slotMgr.getCapacity(),
		eligibleNodes, subtasks[0].Concurrency)
	if len(adjustedNodes) == 0 {
		// no node has enough slots to run the subtasks, skip balance and skip
		// updating used slots.
		return nil
	}
	adjustedNodeMap := make(map[string]struct{}, len(adjustedNodes))
	for _, n := range adjustedNodes {
		adjustedNodeMap[n] = struct{}{}
	}
	defer func() {
		if err == nil {
			b.updateUsedNodes(subtasks)
		}
	}()
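
	// target distribution: every adjusted node gets averageSubtaskCnt subtasks
	// and averageSubtaskRemainder of them get one extra, e.g. 7 subtasks on 3
	// adjusted nodes gives averageSubtaskCnt=2, remainder=1, i.e. targets 3/2/2.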
	averageSubtaskCnt := len(subtasks) / len(adjustedNodes)
	averageSubtaskRemainder := len(subtasks) - averageSubtaskCnt*len(adjustedNodes)
	executorSubtasks := make(map[string][]*proto.SubtaskBase, len(adjustedNodes))
	executorPendingCnts := make(map[string]int, len(adjustedNodes))
	for _, node := range adjustedNodes {
		executorSubtasks[node] = make([]*proto.SubtaskBase, 0, averageSubtaskCnt+1)
	}
	for _, subtask := range subtasks {
		// put running subtasks at the front of the slice.
		// if a subtask failed over, it's possible that one task executor has
		// multiple running subtasks.
		if subtask.State == proto.SubtaskStateRunning {
			executorSubtasks[subtask.ExecID] = append([]*proto.SubtaskBase{subtask}, executorSubtasks[subtask.ExecID]...)
		} else {
			executorSubtasks[subtask.ExecID] = append(executorSubtasks[subtask.ExecID], subtask)
			executorPendingCnts[subtask.ExecID]++
		}
	}
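
	// walk through each executor's subtasks: subtasks on dead or slot-starved
	// nodes are all moved away; on live nodes only pending subtasks above the
	// target count are moved, and executors keeping averageSubtaskCnt+1
	// subtasks claim part of the remainder quota.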
	subtasksNeedSchedule := make([]*proto.SubtaskBase, 0)
	remainder := averageSubtaskRemainder
	executorWithOneMoreSubtask := make(map[string]struct{}, remainder)
	for node, sts := range executorSubtasks {
		if _, ok := adjustedNodeMap[node]; !ok {
			b.logger.Info("dead node or not enough slots, schedule subtasks away",
				zap.Int64("task-id", taskID),
				zap.String("node", node),
				zap.Int("slot-capacity", b.slotMgr.getCapacity()),
				zap.Int("used-slots", b.currUsedSlots[node]))
			// the node is dead or doesn't have enough slots.
			subtasksNeedSchedule = append(subtasksNeedSchedule, sts...)
			delete(executorSubtasks, node)
			continue
		}
		if remainder > 0 {
			// the first `remainder` nodes will get 1 more subtask.
			if len(sts) >= averageSubtaskCnt+1 {
				needScheduleCnt := len(sts) - (averageSubtaskCnt + 1)
				// running subtasks are never balanced.
				needScheduleCnt = min(executorPendingCnts[node], needScheduleCnt)
				subtasksNeedSchedule = append(subtasksNeedSchedule, sts[len(sts)-needScheduleCnt:]...)
				executorSubtasks[node] = sts[:len(sts)-needScheduleCnt]
				executorWithOneMoreSubtask[node] = struct{}{}
				remainder--
			}
		} else if len(sts) > averageSubtaskCnt {
			// running subtasks are never balanced.
			cnt := min(executorPendingCnts[node], len(sts)-averageSubtaskCnt)
			subtasksNeedSchedule = append(subtasksNeedSchedule, sts[len(sts)-cnt:]...)
			executorSubtasks[node] = sts[:len(sts)-cnt]
		}
	}
	if len(subtasksNeedSchedule) == 0 {
		return nil
	}
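
	// hand out any unclaimed "+1" quota to the remaining adjusted nodes so the
	// per-node target counts add up to the total number of subtasks.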
	for i := 0; i < len(adjustedNodes) && remainder > 0; i++ {
		if _, ok := executorWithOneMoreSubtask[adjustedNodes[i]]; !ok {
			executorWithOneMoreSubtask[adjustedNodes[i]] = struct{}{}
			remainder--
		}
	}
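
	// assign the subtasks that need scheduling to nodes that are still below
	// their target count.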
	fillIdx := 0
	for _, node := range adjustedNodes {
		sts := executorSubtasks[node]
		targetSubtaskCnt := averageSubtaskCnt
		if _, ok := executorWithOneMoreSubtask[node]; ok {
			targetSubtaskCnt = averageSubtaskCnt + 1
		}
		for i := len(sts); i < targetSubtaskCnt && fillIdx < len(subtasksNeedSchedule); i++ {
			subtasksNeedSchedule[fillIdx].ExecID = node
			fillIdx++
		}
	}

	if err = b.taskMgr.UpdateSubtasksExecIDs(ctx, subtasksNeedSchedule); err != nil {
		return err
	}
	b.logger.Info("balance subtasks", zap.Stringers("subtasks", subtasksNeedSchedule))
	return nil
}
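
// updateUsedNodes records the slots taken by the given subtasks into
// currUsedSlots. Each executor is counted once with the subtask concurrency,
// since subtasks of the same task share the slots allocated on a task executor.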
func (b *balancer) updateUsedNodes(subtasks []*proto.SubtaskBase) {
	used := make(map[string]int, len(b.currUsedSlots))
	// see slotManager.alloc in task executor.
	for _, st := range subtasks {
		if _, ok := used[st.ExecID]; !ok {
			used[st.ExecID] = st.Concurrency
		}
	}
	for node, slots := range used {
		b.currUsedSlots[node] += slots
	}
}