tidb/pkg/disttask/framework/taskexecutor/task_executor.go

// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package taskexecutor
import (
"context"
"sync"
"sync/atomic"
"time"
"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
"github.com/pingcap/log"
"github.com/pingcap/tidb/br/pkg/lightning/common"
llog "github.com/pingcap/tidb/br/pkg/lightning/log"
"github.com/pingcap/tidb/pkg/disttask/framework/handle"
"github.com/pingcap/tidb/pkg/disttask/framework/proto"
"github.com/pingcap/tidb/pkg/disttask/framework/scheduler"
"github.com/pingcap/tidb/pkg/disttask/framework/storage"
"github.com/pingcap/tidb/pkg/disttask/framework/taskexecutor/execute"
"github.com/pingcap/tidb/pkg/domain/infosync"
"github.com/pingcap/tidb/pkg/metrics"
"github.com/pingcap/tidb/pkg/util"
"github.com/pingcap/tidb/pkg/util/backoff"
"github.com/pingcap/tidb/pkg/util/gctuner"
"github.com/pingcap/tidb/pkg/util/intest"
"github.com/pingcap/tidb/pkg/util/memory"
"go.uber.org/zap"
)
var (
// checkBalanceSubtaskInterval is the default interval for checking whether
// subtasks are balanced to/away from this node.
checkBalanceSubtaskInterval = 2 * time.Second
)
var (
// ErrCancelSubtask is the cancel cause when cancelling subtasks.
ErrCancelSubtask = errors.New("cancel subtasks")
// ErrFinishSubtask is the cancel cause when TaskExecutor successfully processed subtasks.
ErrFinishSubtask = errors.New("finish subtasks")
// ErrNonIdempotentSubtask means the subtask is left in running state and is not idempotent,
// so it cannot be run again.
ErrNonIdempotentSubtask = errors.New("subtask in running state and is not idempotent")
// TestSyncChan is used to sync the test.
TestSyncChan = make(chan struct{})
)
// BaseTaskExecutor is the base implementation of TaskExecutor.
type BaseTaskExecutor struct {
// id is the same as the server id for now, i.e. host:port.
id string
task atomic.Pointer[proto.Task]
taskTable TaskTable
logger *zap.Logger
ctx context.Context
cancel context.CancelFunc
Extension
currSubtaskID atomic.Int64
mu struct {
sync.RWMutex
err error
// handled indicates whether the error has been updated to one of the subtasks.
handled bool
// runtimeCancel is used to cancel Run/Rollback when an error occurs.
runtimeCancel context.CancelCauseFunc
}
}
// NewBaseTaskExecutor creates a new BaseTaskExecutor.
func NewBaseTaskExecutor(ctx context.Context, id string, task *proto.Task, taskTable TaskTable) *BaseTaskExecutor {
logger := log.L().With(zap.Int64("task-id", task.ID), zap.String("task-type", string(task.Type)))
if intest.InTest {
logger = logger.With(zap.String("server-id", id))
}
subCtx, cancelFunc := context.WithCancel(ctx)
taskExecutorImpl := &BaseTaskExecutor{
id: id,
taskTable: taskTable,
ctx: subCtx,
cancel: cancelFunc,
logger: logger,
}
taskExecutorImpl.task.Store(task)
return taskExecutorImpl
}
// checkBalanceSubtask checks whether subtasks are balanced to or away from this node.
// - If other subtasks in `running` state are scheduled to this node, try to change
// them back to `pending` state, to make sure subtasks can be balanced later when the node scales out.
// - If the currently running subtask is scheduled away from this node, i.e. this node
// is taken as down, cancel the run.
func (e *BaseTaskExecutor) checkBalanceSubtask(ctx context.Context) {
ticker := time.NewTicker(checkBalanceSubtaskInterval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
}
task := e.task.Load()
subtasks, err := e.taskTable.GetSubtasksByExecIDAndStepAndStates(ctx, e.id, task.ID, task.Step,
proto.SubtaskStateRunning)
if err != nil {
e.logger.Error("get subtasks failed", zap.Error(err))
continue
}
if len(subtasks) == 0 {
e.logger.Info("subtask is scheduled away, cancel running")
// cancel runStep, but leave the subtask state unchanged.
e.cancelRunStepWith(nil)
return
}
extraRunningSubtasks := make([]*proto.Subtask, 0, len(subtasks))
for _, st := range subtasks {
if st.ID == e.currSubtaskID.Load() {
continue
}
if !e.IsIdempotent(st) {
e.updateSubtaskStateAndError(ctx, st, proto.SubtaskStateFailed, ErrNonIdempotentSubtask)
return
}
extraRunningSubtasks = append(extraRunningSubtasks, st)
}
if len(extraRunningSubtasks) > 0 {
if err = e.taskTable.RunningSubtasksBack2Pending(ctx, extraRunningSubtasks); err != nil {
e.logger.Error("update running subtasks back to pending failed", zap.Error(err))
}
}
}
}
// Init implements the TaskExecutor interface.
func (*BaseTaskExecutor) Init(_ context.Context) error {
return nil
}
// Ctx returns the context of the task executor.
// TODO: remove it when add-index.taskexecutor.Init doesn't depend on it.
func (e *BaseTaskExecutor) Ctx() context.Context {
return e.ctx
}
// Run implements the TaskExecutor interface.
func (e *BaseTaskExecutor) Run(resource *proto.StepResource) {
var err error
// the task executor occupies resources; if there is no subtask to run for 10s,
// we release the resources so that other tasks can use them.
// 300ms + 600ms + 1.2s + 2s * 4 = 10.1s
backoffer := backoff.NewExponential(defaultCheckInterval, 2, maxCheckInterval)
checkInterval, noSubtaskCheckCnt := defaultCheckInterval, 0
for {
select {
case <-e.ctx.Done():
return
case <-time.After(checkInterval):
}
failpoint.Inject("mockStopManager", func() {
TestContexts.Store(e.id, &TestContext{make(chan struct{}), atomic.Bool{}})
go func() {
v, ok := TestContexts.Load(e.id)
if ok {
<-v.(*TestContext).TestSyncSubtaskRun
infosync.MockGlobalServerInfoManagerEntry.DeleteByExecID(e.id)
}
}()
})
if err = e.refreshTask(); err != nil {
if errors.Cause(err) == storage.ErrTaskNotFound {
return
}
e.logger.Error("refresh task failed", zap.Error(err))
continue
}
task := e.task.Load()
if task.State != proto.TaskStateRunning && task.State != proto.TaskStateReverting {
return
}
if exist, err := e.taskTable.HasSubtasksInStates(e.ctx, e.id, task.ID, task.Step,
unfinishedSubtaskStates...); err != nil {
e.logger.Error("check whether there are subtasks to run failed", zap.Error(err))
continue
} else if !exist {
if noSubtaskCheckCnt >= maxChecksWhenNoSubtask {
e.logger.Info("no subtask to run for a while, exit")
break
}
checkInterval = backoffer.Backoff(noSubtaskCheckCnt)
noSubtaskCheckCnt++
continue
}
// reset it when we get a subtask
checkInterval, noSubtaskCheckCnt = defaultCheckInterval, 0
switch task.State {
case proto.TaskStateRunning:
err = e.RunStep(resource)
case proto.TaskStateReverting:
// TODO: will remove it later, leave it now.
err = e.Rollback()
}
if err != nil {
e.logger.Error("failed to handle task", zap.Error(err))
}
}
}
// RunStep starts to fetch and run all subtasks for the step of the task on the node.
// It returns when there is no subtask to run.
func (e *BaseTaskExecutor) RunStep(resource *proto.StepResource) (err error) {
defer func() {
if r := recover(); r != nil {
e.logger.Error("BaseTaskExecutor panicked", zap.Any("recover", r), zap.Stack("stack"))
err4Panic := errors.Errorf("%v", r)
err1 := e.updateSubtask(err4Panic)
if err == nil {
err = err1
}
}
}()
err = e.runStep(resource)
if e.mu.handled {
return err
}
if err == nil {
// there may still be an error from
// 1. the defer function in run(ctx, task)
// 2. a cancelled ctx
// TODO: refine onError/getError
if e.getError() != nil {
err = e.getError()
} else if e.ctx.Err() != nil {
err = e.ctx.Err()
} else {
return nil
}
}
return e.updateSubtask(err)
}
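// runStep fetches the subtasks of the current task step that are assigned to
// this node and runs them one by one, until there is no subtask left, an error
// occurs, or the step context is cancelled.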
func (e *BaseTaskExecutor) runStep(resource *proto.StepResource) (resErr error) {
runStepCtx, runStepCancel := context.WithCancelCause(e.ctx)
e.registerRunStepCancelFunc(runStepCancel)
defer func() {
runStepCancel(ErrFinishSubtask)
e.unregisterRunStepCancelFunc()
}()
e.resetError()
task := e.task.Load()
stepLogger := llog.BeginTask(e.logger.With(
zap.String("step", proto.Step2Str(task.Type, task.Step)),
zap.Float64("mem-limit-percent", gctuner.GlobalMemoryLimitTuner.GetPercentage()),
zap.String("server-mem-limit", memory.ServerMemoryLimitOriginText.Load()),
zap.Stringer("resource", resource),
), "execute task step")
// log at info level; the subtask might be cancelled, let the caller check it.
defer func() {
stepLogger.End(zap.InfoLevel, resErr)
}()
summary, cleanup, err := runSummaryCollectLoop(runStepCtx, task, e.taskTable)
if err != nil {
e.onError(err)
return e.getError()
}
defer cleanup()
stepExecutor, err := e.GetStepExecutor(task, summary, resource)
if err != nil {
e.onError(err)
return e.getError()
}
failpoint.Inject("mockExecSubtaskInitEnvErr", func() {
failpoint.Return(errors.New("mockExecSubtaskInitEnvErr"))
})
if err := stepExecutor.Init(runStepCtx); err != nil {
e.onError(err)
return e.getError()
}
defer func() {
err := stepExecutor.Cleanup(runStepCtx)
if err != nil {
e.logger.Error("cleanup subtask exec env failed", zap.Error(err))
e.onError(err)
}
}()
subtasks, err := e.taskTable.GetSubtasksByExecIDAndStepAndStates(
runStepCtx, e.id, task.ID, task.Step,
proto.SubtaskStatePending, proto.SubtaskStateRunning)
if err != nil {
e.onError(err)
return e.getError()
}
for _, subtask := range subtasks {
metrics.IncDistTaskSubTaskCnt(subtask)
metrics.StartDistTaskSubTask(subtask)
}
for {
// check if any error occurs.
if err := e.getError(); err != nil {
break
}
if runStepCtx.Err() != nil {
break
}
subtask, err := e.taskTable.GetFirstSubtaskInStates(runStepCtx, e.id, task.ID, task.Step,
proto.SubtaskStatePending, proto.SubtaskStateRunning)
if err != nil {
e.logger.Warn("GetFirstSubtaskInStates meets error", zap.Error(err))
continue
}
if subtask == nil {
break
}
if subtask.State == proto.SubtaskStateRunning {
if !e.IsIdempotent(subtask) {
e.logger.Info("subtask in running state and is not idempotent, fail it",
zap.Int64("subtask-id", subtask.ID))
e.onError(ErrNonIdempotentSubtask)
e.updateSubtaskStateAndError(runStepCtx, subtask, proto.SubtaskStateFailed, ErrNonIdempotentSubtask)
e.markErrorHandled()
break
}
} else {
// subtask.State == proto.SubtaskStatePending
err := e.startSubtaskAndUpdateState(runStepCtx, subtask)
if err != nil {
e.logger.Warn("startSubtaskAndUpdateState meets error", zap.Error(err))
// ErrSubtaskNotFound should be ignored,
// since it only indicates that the subtask is not owned by the current task executor.
if err == storage.ErrSubtaskNotFound {
continue
}
e.onError(err)
continue
}
}
failpoint.Inject("mockCleanExecutor", func() {
v, ok := TestContexts.Load(e.id)
if ok {
if v.(*TestContext).mockDown.Load() {
failpoint.Break()
}
}
})
failpoint.Inject("cancelBeforeRunSubtask", func() {
runStepCancel(nil)
})
e.runSubtask(runStepCtx, stepExecutor, subtask)
}
return e.getError()
}
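// runSubtask runs a single subtask through the step executor. While it runs,
// a background goroutine checks whether the subtask is balanced to/away from
// this node; on success, onSubtaskFinished is triggered.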
func (e *BaseTaskExecutor) runSubtask(ctx context.Context, stepExecutor execute.StepExecutor, subtask *proto.Subtask) {
err := func() error {
e.currSubtaskID.Store(subtask.ID)
var wg util.WaitGroupWrapper
checkCtx, checkCancel := context.WithCancel(ctx)
wg.RunWithLog(func() {
e.checkBalanceSubtask(checkCtx)
})
defer func() {
checkCancel()
wg.Wait()
}()
return stepExecutor.RunSubtask(ctx, subtask)
}()
failpoint.Inject("MockRunSubtaskCancel", func(val failpoint.Value) {
if val.(bool) {
err = ErrCancelSubtask
}
})
failpoint.Inject("MockRunSubtaskContextCanceled", func(val failpoint.Value) {
if val.(bool) {
err = context.Canceled
}
})
if err != nil {
e.onError(err)
}
finished := e.markSubTaskCanceledOrFailed(ctx, subtask)
if finished {
return
}
failpoint.Inject("mockTiDBDown", func(val failpoint.Value) {
e.logger.Info("trigger mockTiDBDown")
if e.id == val.(string) || e.id == ":4001" || e.id == ":4002" {
v, ok := TestContexts.Load(e.id)
if ok {
v.(*TestContext).TestSyncSubtaskRun <- struct{}{}
v.(*TestContext).mockDown.Store(true)
e.logger.Info("mockTiDBDown")
time.Sleep(2 * time.Second)
failpoint.Return()
}
}
})
failpoint.Inject("mockTiDBDown2", func() {
if e.id == ":4003" && subtask.Step == proto.StepTwo {
v, ok := TestContexts.Load(e.id)
if ok {
v.(*TestContext).TestSyncSubtaskRun <- struct{}{}
v.(*TestContext).mockDown.Store(true)
time.Sleep(2 * time.Second)
return
}
}
})
failpoint.Inject("mockTiDBPartitionThenResume", func(val failpoint.Value) {
if val.(bool) && (e.id == ":4000" || e.id == ":4001" || e.id == ":4002") {
infosync.MockGlobalServerInfoManagerEntry.DeleteByExecID(e.id)
time.Sleep(20 * time.Second)
}
})
failpoint.Inject("MockExecutorRunErr", func(val failpoint.Value) {
if val.(bool) {
e.onError(errors.New("MockExecutorRunErr"))
}
})
failpoint.Inject("MockExecutorRunCancel", func(val failpoint.Value) {
if taskID, ok := val.(int); ok {
mgr, err := storage.GetTaskManager()
if err != nil {
e.logger.Error("get task manager failed", zap.Error(err))
} else {
err = mgr.CancelTask(ctx, int64(taskID))
if err != nil {
e.logger.Error("cancel task failed", zap.Error(err))
}
}
}
})
e.onSubtaskFinished(ctx, stepExecutor, subtask)
}
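// onSubtaskFinished runs the step executor's OnFinished hook and marks the
// subtask as succeeded; if an error has been recorded, the subtask is marked
// canceled or failed instead.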
func (e *BaseTaskExecutor) onSubtaskFinished(ctx context.Context, executor execute.StepExecutor, subtask *proto.Subtask) {
if err := e.getError(); err == nil {
if err = executor.OnFinished(ctx, subtask); err != nil {
e.onError(err)
}
}
failpoint.Inject("MockSubtaskFinishedCancel", func(val failpoint.Value) {
if val.(bool) {
e.onError(ErrCancelSubtask)
}
})
finished := e.markSubTaskCanceledOrFailed(ctx, subtask)
if finished {
return
}
e.finishSubtaskAndUpdateState(ctx, subtask)
finished = e.markSubTaskCanceledOrFailed(ctx, subtask)
if finished {
return
}
failpoint.Inject("syncAfterSubtaskFinish", func() {
TestSyncChan <- struct{}{}
<-TestSyncChan
})
}
// Rollback rolls back the subtasks.
// TODO: no need to start an executor to do this; refactor it later.
func (e *BaseTaskExecutor) Rollback() error {
task := e.task.Load()
e.resetError()
e.logger.Info("taskExecutor rollback a step", zap.String("step", proto.Step2Str(task.Type, task.Step)))
// We should cancel all subtasks before rolling back
for {
// TODO: we could update them with a single SQL statement, but that requires
// changing the metric gathering logic.
subtask, err := e.taskTable.GetFirstSubtaskInStates(e.ctx, e.id, task.ID, task.Step,
proto.SubtaskStatePending, proto.SubtaskStateRunning)
if err != nil {
e.onError(err)
return e.getError()
}
if subtask == nil {
break
}
e.updateSubtaskStateAndError(e.ctx, subtask, proto.SubtaskStateCanceled, nil)
if err = e.getError(); err != nil {
return err
}
}
return e.getError()
}
// GetTask implements TaskExecutor.GetTask.
func (e *BaseTaskExecutor) GetTask() *proto.Task {
return e.task.Load()
}
// CancelRunningSubtask implements TaskExecutor.CancelRunningSubtask.
func (e *BaseTaskExecutor) CancelRunningSubtask() {
e.cancelRunStepWith(ErrCancelSubtask)
}
// Cancel implements TaskExecutor.Cancel.
func (e *BaseTaskExecutor) Cancel() {
e.cancel()
}
// Close closes the TaskExecutor when all the subtasks are complete.
func (e *BaseTaskExecutor) Close() {
e.Cancel()
}
// refreshTask fetches the latest task state from the tidb_global_task table.
func (e *BaseTaskExecutor) refreshTask() error {
task := e.GetTask()
newTask, err := e.taskTable.GetTaskByID(e.ctx, task.ID)
if err != nil {
return err
}
e.task.Store(newTask)
return nil
}
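// runSummaryCollectLoop starts the row-count collect loop for task types that
// provide a Summary, returning the summary together with a cleanup function
// that persists the row count; otherwise it returns a nil summary and a no-op cleanup.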
func runSummaryCollectLoop(
ctx context.Context,
task *proto.Task,
taskTable TaskTable,
) (summary *execute.Summary, cleanup func(), err error) {
failpoint.Inject("mockSummaryCollectErr", func() {
failpoint.Return(nil, func() {}, errors.New("summary collect err"))
})
taskMgr, ok := taskTable.(*storage.TaskManager)
if !ok {
return nil, func() {}, nil
}
opt, ok := taskTypes[task.Type]
if !ok {
return nil, func() {}, errors.Errorf("taskExecutor option for type %s not found", task.Type)
}
if opt.Summary != nil {
go opt.Summary.UpdateRowCountLoop(ctx, taskMgr)
return opt.Summary, func() {
opt.Summary.PersistRowCount(ctx, taskMgr)
}, nil
}
return nil, func() {}, nil
}
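// registerRunStepCancelFunc stores the cancel function of the current run-step
// context so it can be cancelled from other goroutines.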
func (e *BaseTaskExecutor) registerRunStepCancelFunc(cancel context.CancelCauseFunc) {
e.mu.Lock()
defer e.mu.Unlock()
e.mu.runtimeCancel = cancel
}
func (e *BaseTaskExecutor) unregisterRunStepCancelFunc() {
e.mu.Lock()
defer e.mu.Unlock()
e.mu.runtimeCancel = nil
}
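// cancelRunStepWith cancels the current run-step context, if any, with the given cause.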
func (e *BaseTaskExecutor) cancelRunStepWith(cause error) {
e.mu.Lock()
defer e.mu.Unlock()
if e.mu.runtimeCancel != nil {
e.mu.runtimeCancel(cause)
}
}
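// onError records the first error met by the task executor and cancels the
// current run-step context with the given error.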
func (e *BaseTaskExecutor) onError(err error) {
if err == nil {
return
}
err = errors.Trace(err)
e.logger.Error("onError", zap.Error(err), zap.Stack("stack"))
e.mu.Lock()
defer e.mu.Unlock()
if e.mu.err == nil {
e.mu.err = err
e.logger.Error("taskExecutor met first error", zap.Error(err))
}
if e.mu.runtimeCancel != nil {
e.mu.runtimeCancel(err)
}
}
func (e *BaseTaskExecutor) markErrorHandled() {
e.mu.Lock()
defer e.mu.Unlock()
e.mu.handled = true
}
func (e *BaseTaskExecutor) getError() error {
e.mu.RLock()
defer e.mu.RUnlock()
return e.mu.err
}
func (e *BaseTaskExecutor) resetError() {
e.mu.Lock()
defer e.mu.Unlock()
e.mu.err = nil
e.mu.handled = false
}
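// startSubtaskAndUpdateState marks the subtask as running in the task table
// and, on success, updates the in-memory state and subtask metrics.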
func (e *BaseTaskExecutor) startSubtaskAndUpdateState(ctx context.Context, subtask *proto.Subtask) error {
err := e.startSubtask(ctx, subtask.ID)
if err == nil {
metrics.DecDistTaskSubTaskCnt(subtask)
metrics.EndDistTaskSubTask(subtask)
subtask.State = proto.SubtaskStateRunning
metrics.IncDistTaskSubTaskCnt(subtask)
metrics.StartDistTaskSubTask(subtask)
}
return err
}
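// updateSubtaskStateAndErrorImpl updates the subtask state and error in the
// task table, retrying with exponential backoff on failure.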
func (e *BaseTaskExecutor) updateSubtaskStateAndErrorImpl(ctx context.Context, execID string, subtaskID int64, state proto.SubtaskState, subTaskErr error) {
// retry for 3+6+12+24+(30-4)*30 ~= 825s ~= 14 minutes
backoffer := backoff.NewExponential(scheduler.RetrySQLInterval, 2, scheduler.RetrySQLMaxInterval)
err := handle.RunWithRetry(ctx, scheduler.RetrySQLTimes, backoffer, e.logger,
func(ctx context.Context) (bool, error) {
return true, e.taskTable.UpdateSubtaskStateAndError(ctx, execID, subtaskID, state, subTaskErr)
},
)
if err != nil {
e.onError(err)
}
}
// startSubtask tries to change the state of the subtask to running.
// If the subtask is not owned by this task executor,
// the update fails and the task executor should not run the subtask.
func (e *BaseTaskExecutor) startSubtask(ctx context.Context, subtaskID int64) error {
// retry for 3+6+12+24+(30-4)*30 ~= 825s ~= 14 minutes
backoffer := backoff.NewExponential(scheduler.RetrySQLInterval, 2, scheduler.RetrySQLMaxInterval)
return handle.RunWithRetry(ctx, scheduler.RetrySQLTimes, backoffer, e.logger,
func(ctx context.Context) (bool, error) {
err := e.taskTable.StartSubtask(ctx, subtaskID, e.id)
if err == storage.ErrSubtaskNotFound {
// No need to retry.
return false, err
}
return true, err
},
)
}
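// finishSubtask persists the subtask meta and marks the subtask as finished in
// the task table, retrying with exponential backoff on failure.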
func (e *BaseTaskExecutor) finishSubtask(ctx context.Context, subtask *proto.Subtask) {
backoffer := backoff.NewExponential(scheduler.RetrySQLInterval, 2, scheduler.RetrySQLMaxInterval)
err := handle.RunWithRetry(ctx, scheduler.RetrySQLTimes, backoffer, e.logger,
func(ctx context.Context) (bool, error) {
return true, e.taskTable.FinishSubtask(ctx, subtask.ExecID, subtask.ID, subtask.Meta)
},
)
if err != nil {
e.onError(err)
}
}
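// updateSubtaskStateAndError updates the subtask state both in the task table
// and in memory, and refreshes the subtask metrics.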
func (e *BaseTaskExecutor) updateSubtaskStateAndError(ctx context.Context, subtask *proto.Subtask, state proto.SubtaskState, subTaskErr error) {
metrics.DecDistTaskSubTaskCnt(subtask)
metrics.EndDistTaskSubTask(subtask)
e.updateSubtaskStateAndErrorImpl(ctx, subtask.ExecID, subtask.ID, state, subTaskErr)
subtask.State = state
metrics.IncDistTaskSubTaskCnt(subtask)
if !subtask.IsDone() {
metrics.StartDistTaskSubTask(subtask)
}
}
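// finishSubtaskAndUpdateState finishes the subtask, marks it as succeeded in
// memory, and refreshes the subtask metrics.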
func (e *BaseTaskExecutor) finishSubtaskAndUpdateState(ctx context.Context, subtask *proto.Subtask) {
metrics.DecDistTaskSubTaskCnt(subtask)
metrics.EndDistTaskSubTask(subtask)
e.finishSubtask(ctx, subtask)
subtask.State = proto.SubtaskStateSucceed
metrics.IncDistTaskSubTaskCnt(subtask)
}
// markSubTaskCanceledOrFailed checks the error type and decides the subtask's state.
// 1. Only cancel the subtask when ErrCancelSubtask is met.
// 2. Only fail the subtask when a non-retryable error is met.
// 3. When other errors are met, don't change the subtask's state.
func (e *BaseTaskExecutor) markSubTaskCanceledOrFailed(ctx context.Context, subtask *proto.Subtask) bool {
if err := e.getError(); err != nil {
err := errors.Cause(err)
if ctx.Err() != nil && context.Cause(ctx) == ErrCancelSubtask {
e.logger.Warn("subtask canceled", zap.Error(err))
e.updateSubtaskStateAndError(e.ctx, subtask, proto.SubtaskStateCanceled, nil)
} else if e.IsRetryableError(err) {
e.logger.Warn("meet retryable error", zap.Error(err))
} else if common.IsContextCanceledError(err) {
e.logger.Info("meet context canceled for gracefully shutdown", zap.Error(err))
} else {
e.logger.Warn("subtask failed", zap.Error(err))
e.updateSubtaskStateAndError(e.ctx, subtask, proto.SubtaskStateFailed, err)
}
e.markErrorHandled()
return true
}
return false
}
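// failSubtaskWithRetry calls taskTable.FailSubtask to fail a subtask of the
// task with the given error, retrying with exponential backoff on failure.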
func (e *BaseTaskExecutor) failSubtaskWithRetry(ctx context.Context, taskID int64, err error) error {
backoffer := backoff.NewExponential(scheduler.RetrySQLInterval, 2, scheduler.RetrySQLMaxInterval)
err1 := handle.RunWithRetry(e.ctx, scheduler.RetrySQLTimes, backoffer, e.logger,
func(_ context.Context) (bool, error) {
return true, e.taskTable.FailSubtask(ctx, e.id, taskID, err)
},
)
if err1 == nil {
e.logger.Info("failed one subtask succeed", zap.NamedError("subtask-err", err))
}
return err1
}
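// cancelSubtaskWithRetry calls taskTable.CancelSubtask to cancel a subtask of
// the task, retrying with exponential backoff on failure.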
func (e *BaseTaskExecutor) cancelSubtaskWithRetry(ctx context.Context, taskID int64, err error) error {
e.logger.Warn("subtask canceled", zap.NamedError("subtask-cancel", err))
backoffer := backoff.NewExponential(scheduler.RetrySQLInterval, 2, scheduler.RetrySQLMaxInterval)
err1 := handle.RunWithRetry(e.ctx, scheduler.RetrySQLTimes, backoffer, e.logger,
func(_ context.Context) (bool, error) {
return true, e.taskTable.CancelSubtask(ctx, e.id, taskID)
},
)
if err1 == nil {
e.logger.Info("canceled one subtask succeed", zap.NamedError("subtask-cancel", err))
}
return err1
}
// updateSubtask checks the error type and decides the subtask's state.
// 1. Only cancel the subtask when ErrCancelSubtask is met.
// 2. Only fail the subtask when a non-retryable error is met.
// 3. When other errors are met, don't change the subtask's state.
// Errors handled here should not happen during subtask execution;
// errors are only handled before and after subtask execution.
func (e *BaseTaskExecutor) updateSubtask(err error) error {
task := e.task.Load()
err = errors.Cause(err)
// TODO this branch is unreachable now, remove it when we refactor error handling.
if e.ctx.Err() != nil && context.Cause(e.ctx) == ErrCancelSubtask {
return e.cancelSubtaskWithRetry(e.ctx, task.ID, ErrCancelSubtask)
} else if e.IsRetryableError(err) {
e.logger.Warn("meet retryable error", zap.Error(err))
} else if common.IsContextCanceledError(err) {
e.logger.Info("meet context canceled for gracefully shutdown", zap.Error(err))
} else {
return e.failSubtaskWithRetry(e.ctx, task.ID, err)
}
return nil
}