tidb/pkg/disttask/framework/scheduler/scheduler_manager.go
// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package scheduler

import (
	"context"
	"slices"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	"github.com/pingcap/log"
	"github.com/pingcap/tidb/pkg/disttask/framework/handle"
	"github.com/pingcap/tidb/pkg/disttask/framework/proto"
	"github.com/pingcap/tidb/pkg/metrics"
	tidbutil "github.com/pingcap/tidb/pkg/util"
	"github.com/pingcap/tidb/pkg/util/intest"
	"github.com/pingcap/tidb/pkg/util/syncutil"
	"go.uber.org/zap"
)
var (
	// CheckTaskRunningInterval is the interval for loading tasks.
	// It is exported for testing.
	CheckTaskRunningInterval = 3 * time.Second
	// defaultHistorySubtaskTableGcInterval is the GC interval of the history subtask table.
	defaultHistorySubtaskTableGcInterval = 24 * time.Hour
	// DefaultCleanUpInterval is the interval of the cleanup routine.
	DefaultCleanUpInterval = 10 * time.Minute
	defaultCollectMetricsInterval = 5 * time.Second
)

// WaitTaskFinished is used to sync tests.
var WaitTaskFinished = make(chan struct{})
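// getSchedulerCount returns the number of schedulers currently managed.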
func (sm *Manager) getSchedulerCount() int {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	return len(sm.mu.schedulerMap)
}
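// addScheduler registers the scheduler of the given task and keeps the scheduler slice sorted in task order.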
func (sm *Manager) addScheduler(taskID int64, scheduler Scheduler) {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	sm.mu.schedulerMap[taskID] = scheduler
	sm.mu.schedulers = append(sm.mu.schedulers, scheduler)
	slices.SortFunc(sm.mu.schedulers, func(i, j Scheduler) int {
		return i.GetTask().CompareTask(j.GetTask())
	})
}
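// hasScheduler returns whether the given task already has a scheduler.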
func (sm *Manager) hasScheduler(taskID int64) bool {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	_, ok := sm.mu.schedulerMap[taskID]
	return ok
}
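// delScheduler removes the scheduler of the given task from both the map and the ordered slice.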
func (sm *Manager) delScheduler(taskID int64) {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	delete(sm.mu.schedulerMap, taskID)
	for i, scheduler := range sm.mu.schedulers {
		if scheduler.GetTask().ID == taskID {
			sm.mu.schedulers = append(sm.mu.schedulers[:i], sm.mu.schedulers[i+1:]...)
			break
		}
	}
}
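// clearSchedulers removes all schedulers, it's used when the manager stops.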
func (sm *Manager) clearSchedulers() {
	sm.mu.Lock()
	defer sm.mu.Unlock()
	sm.mu.schedulerMap = make(map[int64]Scheduler)
	sm.mu.schedulers = sm.mu.schedulers[:0]
}
// getSchedulers returns a copy of schedulers.
func (sm *Manager) getSchedulers() []Scheduler {
	sm.mu.RLock()
	defer sm.mu.RUnlock()
	res := make([]Scheduler, len(sm.mu.schedulers))
	copy(res, sm.mu.schedulers)
	return res
}
// Manager manages a bunch of schedulers.
// Each scheduler schedules and monitors one task.
// The number of concurrently scheduled tasks is limited by proto.MaxConcurrentTask.
type Manager struct {
	ctx         context.Context
	cancel      context.CancelFunc
	taskMgr     TaskManager
	wg          tidbutil.WaitGroupWrapper
	schedulerWG tidbutil.WaitGroupWrapper
	slotMgr     *SlotManager
	nodeMgr     *NodeManager
	balancer    *balancer
	initialized bool
	// serverID, its value is ip:port now.
	serverID string
	logger   *zap.Logger
	finishCh chan struct{}
	mu       struct {
		syncutil.RWMutex
		schedulerMap map[int64]Scheduler
		// in task order
		schedulers []Scheduler
	}
}
// NewManager creates a scheduler Manager.
func NewManager(ctx context.Context, taskMgr TaskManager, serverID string) *Manager {
	logger := log.L()
	if intest.InTest {
		logger = log.L().With(zap.String("server-id", serverID))
	}
	subCtx, cancel := context.WithCancel(ctx)
	slotMgr := newSlotManager()
	nodeMgr := newNodeManager(serverID)
	schedulerManager := &Manager{
		ctx:      subCtx,
		cancel:   cancel,
		taskMgr:  taskMgr,
		serverID: serverID,
		slotMgr:  slotMgr,
		nodeMgr:  nodeMgr,
		balancer: newBalancer(Param{
			taskMgr:  taskMgr,
			nodeMgr:  nodeMgr,
			slotMgr:  slotMgr,
			serverID: serverID,
		}),
		logger:   logger,
		finishCh: make(chan struct{}, proto.MaxConcurrentTask),
	}
	schedulerManager.mu.schedulerMap = make(map[int64]Scheduler)
	return schedulerManager
}
// Start starts the schedulerManager, it launches scheduleTaskLoop and the other background loops so that multiple schedulers can run.
func (sm *Manager) Start() {
	// init cached managed nodes
	sm.nodeMgr.refreshNodes(sm.ctx, sm.taskMgr, sm.slotMgr)
	sm.wg.Run(sm.scheduleTaskLoop)
	sm.wg.Run(sm.gcSubtaskHistoryTableLoop)
	sm.wg.Run(sm.cleanupTaskLoop)
	sm.wg.Run(sm.collectLoop)
	sm.wg.Run(func() {
		sm.nodeMgr.maintainLiveNodesLoop(sm.ctx, sm.taskMgr)
	})
	sm.wg.Run(func() {
		sm.nodeMgr.refreshNodesLoop(sm.ctx, sm.taskMgr, sm.slotMgr)
	})
	sm.wg.Run(func() {
		sm.balancer.balanceLoop(sm.ctx, sm)
	})
	sm.initialized = true
}
// Cancel cancels the scheduler manager.
// It is used in tests to simulate a TiDB node shutdown.
func (sm *Manager) Cancel() {
	sm.cancel()
}
// Stop the schedulerManager.
func (sm *Manager) Stop() {
	sm.cancel()
	sm.schedulerWG.Wait()
	sm.wg.Wait()
	sm.clearSchedulers()
	sm.initialized = false
	close(sm.finishCh)
}
// Initialized reports whether the manager has been initialized.
func (sm *Manager) Initialized() bool {
	return sm.initialized
}
// scheduleTaskLoop schedules the tasks.
func (sm *Manager) scheduleTaskLoop() {
sm.logger.Info("schedule task loop start")
ticker := time.NewTicker(CheckTaskRunningInterval)
defer ticker.Stop()
for {
select {
case <-sm.ctx.Done():
sm.logger.Info("schedule task loop exits")
return
case <-ticker.C:
case <-handle.TaskChangedCh:
}
taskCnt := sm.getSchedulerCount()
if taskCnt >= proto.MaxConcurrentTask {
sm.logger.Debug("scheduled tasks reached limit",
zap.Int("current", taskCnt), zap.Int("max", proto.MaxConcurrentTask))
continue
}
schedulableTasks, err := sm.getSchedulableTasks()
if err != nil {
continue
}
err = sm.startSchedulers(schedulableTasks)
if err != nil {
continue
}
}
}
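// getSchedulableTasks returns the top unfinished tasks that have no scheduler yet and whose task type is registered.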
func (sm *Manager) getSchedulableTasks() ([]*proto.TaskBase, error) {
	tasks, err := sm.taskMgr.GetTopUnfinishedTasks(sm.ctx)
	if err != nil {
		sm.logger.Warn("get unfinished tasks failed", zap.Error(err))
		return nil, err
	}
	schedulableTasks := make([]*proto.TaskBase, 0, len(tasks))
	for _, task := range tasks {
		if sm.hasScheduler(task.ID) {
			continue
		}
		// We check the task type here, before starting the scheduler, so there is
		// no need to check it again in startScheduler. An unknown type should not
		// happen normally, unless the user modifies the system table directly.
		if getSchedulerFactory(task.Type) == nil {
			sm.logger.Warn("unknown task type", zap.Int64("task-id", task.ID),
				zap.Stringer("task-type", task.Type))
			sm.failTask(task.ID, task.State, errors.New("unknown task type"))
			continue
		}
		schedulableTasks = append(schedulableTasks, task)
	}
	return schedulableTasks, nil
}
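// startSchedulers starts a scheduler for each schedulable task. It stops early
// once the number of running schedulers reaches proto.MaxConcurrentTask, and
// skips tasks for which slots cannot be reserved.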
func (sm *Manager) startSchedulers(schedulableTasks []*proto.TaskBase) error {
	if len(schedulableTasks) == 0 {
		return nil
	}
	if err := sm.slotMgr.update(sm.ctx, sm.nodeMgr, sm.taskMgr); err != nil {
		sm.logger.Warn("update used slot failed", zap.Error(err))
		return err
	}
	for _, task := range schedulableTasks {
		taskCnt := sm.getSchedulerCount()
		if taskCnt >= proto.MaxConcurrentTask {
			break
		}
		var reservedExecID string
		allocateSlots := true
		var ok bool
		switch task.State {
		case proto.TaskStatePending, proto.TaskStateRunning, proto.TaskStateResuming:
			reservedExecID, ok = sm.slotMgr.canReserve(task)
			if !ok {
				// a task of lower rank might still be schedulable, so keep going.
				continue
			}
		// reverting/cancelling/pausing
		default:
			allocateSlots = false
			sm.logger.Info("start scheduler without allocating slots",
				zap.Int64("task-id", task.ID), zap.Stringer("state", task.State))
		}
		metrics.DistTaskGauge.WithLabelValues(task.Type.String(), metrics.SchedulingStatus).Inc()
		metrics.UpdateMetricsForScheduleTask(task.ID, task.Type)
		sm.startScheduler(task, allocateSlots, reservedExecID)
	}
	return nil
}
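// failTask transitions the task from currState to failed with the given error.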
func (sm *Manager) failTask(id int64, currState proto.TaskState, err error) {
	if err2 := sm.taskMgr.FailTask(sm.ctx, id, currState, err); err2 != nil {
		sm.logger.Warn("failed to update task state to failed",
			zap.Int64("task-id", id), zap.Error(err2))
	}
}
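// gcSubtaskHistoryTableLoop periodically GCs the history subtask table.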
func (sm *Manager) gcSubtaskHistoryTableLoop() {
	historySubtaskTableGcInterval := defaultHistorySubtaskTableGcInterval
	failpoint.Inject("historySubtaskTableGcInterval", func(val failpoint.Value) {
		if seconds, ok := val.(int); ok {
			historySubtaskTableGcInterval = time.Second * time.Duration(seconds)
		}
		<-WaitTaskFinished
	})
	sm.logger.Info("subtask table gc loop start")
	ticker := time.NewTicker(historySubtaskTableGcInterval)
	defer ticker.Stop()
	for {
		select {
		case <-sm.ctx.Done():
			sm.logger.Info("subtask history table gc loop exits")
			return
		case <-ticker.C:
			err := sm.taskMgr.GCSubtasks(sm.ctx)
			if err != nil {
				sm.logger.Warn("subtask history table gc failed", zap.Error(err))
			} else {
				sm.logger.Info("subtask history table gc success")
			}
		}
	}
}
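// startScheduler creates and initializes a scheduler for the task, reserves
// slots if allocateSlots is true, and runs the scheduler in a new goroutine
// until the task finishes.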
func (sm *Manager) startScheduler(basicTask *proto.TaskBase, allocateSlots bool, reservedExecID string) {
	task, err := sm.taskMgr.GetTaskByID(sm.ctx, basicTask.ID)
	if err != nil {
		sm.logger.Error("get task failed", zap.Int64("task-id", basicTask.ID), zap.Error(err))
		return
	}
	schedulerFactory := getSchedulerFactory(task.Type)
	scheduler := schedulerFactory(sm.ctx, task, Param{
		taskMgr:        sm.taskMgr,
		nodeMgr:        sm.nodeMgr,
		slotMgr:        sm.slotMgr,
		serverID:       sm.serverID,
		allocatedSlots: allocateSlots,
	})
	if err = scheduler.Init(); err != nil {
		sm.logger.Error("init scheduler failed", zap.Error(err))
		sm.failTask(task.ID, task.State, err)
		return
	}
	sm.addScheduler(task.ID, scheduler)
	if allocateSlots {
		sm.slotMgr.reserve(basicTask, reservedExecID)
	}
	sm.logger.Info("task scheduler started", zap.Int64("task-id", task.ID))
	sm.schedulerWG.RunWithLog(func() {
		defer func() {
			scheduler.Close()
			sm.delScheduler(task.ID)
			if allocateSlots {
				sm.slotMgr.unReserve(basicTask, reservedExecID)
			}
			handle.NotifyTaskChange()
			sm.logger.Info("task scheduler exit", zap.Int64("task-id", task.ID))
		}()
		metrics.UpdateMetricsForRunTask(task)
		scheduler.ScheduleTask()
		sm.finishCh <- struct{}{}
	})
}
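// cleanupTaskLoop periodically cleans up finished tasks. It also runs whenever
// a scheduler finishes and notifies it through finishCh.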
func (sm *Manager) cleanupTaskLoop() {
sm.logger.Info("cleanup loop start")
ticker := time.NewTicker(DefaultCleanUpInterval)
defer ticker.Stop()
for {
select {
case <-sm.ctx.Done():
sm.logger.Info("cleanup loop exits")
return
case <-sm.finishCh:
sm.doCleanupTask()
case <-ticker.C:
sm.doCleanupTask()
}
}
}
// WaitCleanUpFinished is used to sync tests.
var WaitCleanUpFinished = make(chan struct{}, 1)
// doCleanupTask runs the cleanup routine defined for each type of task and
// transfers the cleaned-up tasks to the history table.
// For example:
//
//	tasks with global sort should clean up tmp files stored on S3.
func (sm *Manager) doCleanupTask() {
	tasks, err := sm.taskMgr.GetTasksInStates(
		sm.ctx,
		proto.TaskStateFailed,
		proto.TaskStateReverted,
		proto.TaskStateSucceed,
	)
	if err != nil {
		sm.logger.Warn("get task in states failed", zap.Error(err))
		return
	}
	if len(tasks) == 0 {
		return
	}
	sm.logger.Info("cleanup routine start")
	err = sm.cleanupFinishedTasks(tasks)
	if err != nil {
		sm.logger.Warn("cleanup routine failed", zap.Error(err))
		return
	}
	failpoint.Inject("WaitCleanUpFinished", func() {
		WaitCleanUpFinished <- struct{}{}
	})
	sm.logger.Info("cleanup routine success")
}
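// cleanupFinishedTasks runs the registered cleanup function, if any, for each
// finished task, then transfers the cleaned tasks to the history table.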
func (sm *Manager) cleanupFinishedTasks(tasks []*proto.Task) error {
	cleanedTasks := make([]*proto.Task, 0)
	var firstErr error
	for _, task := range tasks {
		sm.logger.Info("cleanup task", zap.Int64("task-id", task.ID))
		cleanupFactory := getSchedulerCleanUpFactory(task.Type)
		if cleanupFactory != nil {
			cleanup := cleanupFactory()
			err := cleanup.CleanUp(sm.ctx, task)
			if err != nil {
				firstErr = err
				break
			}
			cleanedTasks = append(cleanedTasks, task)
		} else {
			// if the task doesn't register a cleanup function, mark it as cleaned.
			cleanedTasks = append(cleanedTasks, task)
		}
	}
	if firstErr != nil {
		sm.logger.Warn("cleanup routine failed", zap.Error(errors.Trace(firstErr)))
	}
	failpoint.Inject("mockTransferErr", func() {
		failpoint.Return(errors.New("transfer err"))
	})
	return sm.taskMgr.TransferTasks2History(sm.ctx, cleanedTasks)
}
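// collectLoop periodically collects subtask information for metrics.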
func (sm *Manager) collectLoop() {
sm.logger.Info("collect loop start")
ticker := time.NewTicker(defaultCollectMetricsInterval)
defer ticker.Stop()
for {
select {
case <-sm.ctx.Done():
sm.logger.Info("collect loop exits")
return
case <-ticker.C:
sm.collect()
}
}
}
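// collect loads all subtasks and stores them in the subtask metrics collector.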
func (sm *Manager) collect() {
	subtasks, err := sm.taskMgr.GetAllSubtasks(sm.ctx)
	if err != nil {
		sm.logger.Warn("get all subtasks failed", zap.Error(err))
		return
	}
	subtaskCollector.subtaskInfo.Store(&subtasks)
}
// MockScheduler mocks a scheduler for one task, only used for tests.
func (sm *Manager) MockScheduler(task *proto.Task) *BaseScheduler {
	return NewBaseScheduler(sm.ctx, task, Param{
		taskMgr:  sm.taskMgr,
		nodeMgr:  sm.nodeMgr,
		slotMgr:  sm.slotMgr,
		serverID: sm.serverID,
	})
}