Files
tidb/executor/asyncloaddata/util.go
2023-04-13 14:09:01 +08:00

586 lines
16 KiB
Go

// Copyright 2023 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package asyncloaddata
import (
"context"
"fmt"
"net/url"
"time"
"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/parser/terror"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/chunk"
"github.com/pingcap/tidb/util/dbterror/exeerrors"
"github.com/pingcap/tidb/util/logutil"
"github.com/pingcap/tidb/util/sqlexec"
"github.com/tikv/client-go/v2/util"
"go.uber.org/zap"
)
// Job is a handle to one load data (import) job: the ID of its row in
// mysql.load_data_jobs, the SQL executor used to read and update that row,
// and the user the job belongs to.
type Job struct {
// ID is used as the job_id value in every statement issued by Job's methods.
ID int64
// Conn executes the internal SQL statements issued by Job's methods.
// Job don't manage the life cycle of the connection.
Conn sqlexec.SQLExecutor
// User is compared with the create_user column by the methods that filter on
// the owner of the job (CancelJob, DropJob and GetJobInfo).
User string
}
// NewJob builds a Job handle from a job ID, the SQL executor that will run the
// job's statements and the user the job belongs to. It performs no SQL itself.
func NewJob(ID int64, conn sqlexec.SQLExecutor, user string) *Job {
	job := new(Job)
	job.ID = ID
	job.Conn = conn
	job.User = user
	return job
}
// CreateLoadDataJob registers a new load data job by inserting a row into
// mysql.load_data_jobs, then returns a Job whose ID is the value reported by
// SELECT LAST_INSERT_ID() on the same executor.
//
// When dataSource parses as a URI with a non-empty scheme, its query string
// and fragment are dropped before it is stored, because they may contain
// AK/SK. Any other dataSource is stored unchanged.
//
// An error is returned when either statement fails or when the ID query does
// not yield exactly one row.
func CreateLoadDataJob(
	ctx context.Context,
	conn sqlexec.SQLExecutor,
	dataSource, db, table string,
	importMode string,
	user string,
) (*Job, error) {
	const (
		insertJobSQL = `INSERT INTO mysql.load_data_jobs
(data_source, table_schema, table_name, import_mode, create_user)
VALUES (%?, %?, %?, %?, %?);`
		lastInsertIDSQL = `SELECT LAST_INSERT_ID();`
	)

	// remove the params in data source URI because it may contains AK/SK
	if parsed, parseErr := url.Parse(dataSource); parseErr == nil && parsed.Scheme != "" {
		parsed.RawQuery, parsed.Fragment = "", ""
		dataSource = parsed.String()
	}

	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	if _, err := conn.ExecuteInternal(ctx, insertJobSQL,
		dataSource, db, table, importMode, user); err != nil {
		return nil, err
	}

	idRS, err := conn.ExecuteInternal(ctx, lastInsertIDSQL)
	if err != nil {
		return nil, err
	}
	//nolint: errcheck
	defer idRS.Close()

	idRows, err := sqlexec.DrainRecordSet(ctx, idRS, 1)
	if err != nil {
		return nil, err
	}
	if n := len(idRows); n != 1 {
		return nil, errors.Errorf("unexpected result length: %d", n)
	}

	jobID := idRows[0].GetInt64(0)
	return NewJob(jobID, conn, user), nil
}
// TestSyncCh is used in unit test to synchronize the execution of LOAD DATA.
// The channel is unbuffered. The "Sync..." failpoint callbacks in StartJob send
// one value on it and then wait to receive one back before continuing.
var TestSyncCh = make(chan struct{})
// StartJob marks the job as started: a single UPDATE sets start_time and
// update_time to the current timestamp for the row with this job's ID, but
// only while both start_time and end_time are still NULL.
//
// The number of affected rows is not inspected, so nil is returned when no row
// matched; only a failure of the statement itself is reported.
//
// The failpoints before and after the UPDATE are test hooks; the "Sync..."
// ones hand-shake with the test through TestSyncCh.
func (j *Job) StartJob(ctx context.Context) error {
	failpoint.Inject("AfterCreateLoadDataJob", nil)
	failpoint.Inject("SyncAfterCreateLoadDataJob", func() {
		TestSyncCh <- struct{}{}
		<-TestSyncCh
	})

	const markStartedSQL = `UPDATE mysql.load_data_jobs
SET start_time = CURRENT_TIMESTAMP(6), update_time = CURRENT_TIMESTAMP(6)
WHERE job_id = %? AND start_time IS NULL AND end_time IS NULL;`

	internalCtx := util.WithInternalSourceType(ctx, kv.InternalLoadData)
	if _, execErr := j.Conn.ExecuteInternal(internalCtx, markStartedSQL, j.ID); execErr != nil {
		return execErr
	}

	failpoint.Inject("AfterStartJob", nil)
	failpoint.Inject("SyncAfterStartJob", func() {
		TestSyncCh <- struct{}{}
		<-TestSyncCh
	})
	return nil
}
var (
// HeartBeatInSec is the interval of heartbeat, in seconds. It is the ticker
// period used by ProgressUpdateRoutineFn.
HeartBeatInSec = 5
// OfflineThresholdInSec means after failing to update heartbeat for 3 times,
// we treat the worker of the job as offline.
// The value is computed once from HeartBeatInSec when the package is
// initialized; assigning a new value to HeartBeatInSec afterwards does not
// change it.
OfflineThresholdInSec = HeartBeatInSec * 3
)
// UpdateJobProgress stores progress for the job and refreshes update_time. It
// is meant to be called periodically as the heartbeat after StartJob.
//
// The UPDATE only matches a row whose end_time is NULL and whose update_time is
// either NULL or not older than OfflineThresholdInSec seconds. The returned
// bool is true only when exactly one row was affected; when it is false the
// keepalive did not succeed and the caller should call FailJob soon. A non-nil
// error means the statement itself failed.
//
// TODO: Currently if the node is crashed after CreateLoadDataJob and before StartJob,
// it will always be in the status of pending. Maybe we should unify CreateLoadDataJob
// and StartJob.
func (j *Job) UpdateJobProgress(ctx context.Context, progress string) (bool, error) {
	const heartbeatSQL = `UPDATE mysql.load_data_jobs
SET progress = %?, update_time = CURRENT_TIMESTAMP(6)
WHERE job_id = %?
AND end_time IS NULL
AND (update_time >= DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL %? SECOND)
OR update_time IS NULL);`

	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	// The staleness check is done inside the statement so that TiDB handles it
	// for concurrent SQL; the threshold tolerates 2 failed/timed-out heartbeats.
	if _, execErr := j.Conn.ExecuteInternal(ctx, heartbeatSQL,
		progress, j.ID, OfflineThresholdInSec); execErr != nil {
		return false, execErr
	}

	affected := j.Conn.GetSessionVars().StmtCtx.AffectedRows()
	return affected == 1, nil
}
// FinishJob records a successful end of the job: it sets end_time to the
// current timestamp and result_message to result. The UPDATE only matches
// while both result_message and error_message are NULL, so a job can only be
// finished once. Only a failure of the statement itself is returned.
func (j *Job) FinishJob(ctx context.Context, result string) error {
	const finishSQL = `UPDATE mysql.load_data_jobs
SET end_time = CURRENT_TIMESTAMP(6), result_message = %?
WHERE job_id = %? AND result_message IS NULL AND error_message IS NULL;`

	internalCtx := util.WithInternalSourceType(ctx, kv.InternalLoadData)
	_, execErr := j.Conn.ExecuteInternal(internalCtx, finishSQL, result, j.ID)
	return execErr
}
// FailJob records a failed end of the job: it sets end_time to the current
// timestamp and error_message to result. The UPDATE only matches while both
// result_message and error_message are NULL, so a job can only be failed once.
// Only a failure of the statement itself is returned.
func (j *Job) FailJob(ctx context.Context, result string) error {
	const failSQL = `UPDATE mysql.load_data_jobs
SET end_time = CURRENT_TIMESTAMP(6), error_message = %?
WHERE job_id = %? AND result_message IS NULL AND error_message IS NULL;`

	internalCtx := util.WithInternalSourceType(ctx, kv.InternalLoadData)
	_, execErr := j.Conn.ExecuteInternal(internalCtx, failSQL, result, j.ID)
	return execErr
}
// CancelJob cancels a load data job. Only a running/paused job can be canceled.
//
// The check and the update run inside one pessimistic transaction on j.Conn:
// the job row is looked up by job_id and create_user, validated, and then
// updated to expected_status 'canceled' with end_time set and error_message
// 'canceled by user'.
//
// Errors:
//   - ErrLoadDataJobNotFound when no row matches the job ID and user;
//   - ErrLoadDataInvalidOperation when expected_status is neither running nor
//     paused, or when end_time is already set (reported as failed or finished
//     depending on whether error_message is set);
//   - any error from the executor, including the final COMMIT.
//
// The named result err is read by the deferred function to decide between
// COMMIT and ROLLBACK, so every exit path must assign it.
func (j *Job) CancelJob(ctx context.Context) (err error) {
	const (
		lookupSQL = `SELECT expected_status, end_time, error_message FROM mysql.load_data_jobs
WHERE job_id = %? AND create_user = %?;`
		cancelSQL = `UPDATE mysql.load_data_jobs
SET expected_status = 'canceled',
end_time = CURRENT_TIMESTAMP(6),
error_message = 'canceled by user'
WHERE job_id = %?;`
	)

	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	if _, err = j.Conn.ExecuteInternal(ctx, "BEGIN PESSIMISTIC;"); err != nil {
		return err
	}
	// Registered first, so it runs last: the record set opened below is closed
	// before the transaction is committed or rolled back.
	defer func() {
		if err == nil {
			// A failed COMMIT becomes the function's result.
			_, err = j.Conn.ExecuteInternal(ctx, "COMMIT;")
			return
		}
		// Keep the original error; a rollback failure is only logged.
		_, rollbackErr := j.Conn.ExecuteInternal(ctx, "ROLLBACK;")
		terror.Log(rollbackErr)
	}()

	var jobRS sqlexec.RecordSet
	jobRS, err = j.Conn.ExecuteInternal(ctx, lookupSQL, j.ID, j.User)
	if err != nil {
		return err
	}
	defer terror.Call(jobRS.Close)

	var jobRows []chunk.Row
	jobRows, err = sqlexec.DrainRecordSet(ctx, jobRS, 1)
	if err != nil {
		return err
	}
	if len(jobRows) < 1 {
		return exeerrors.ErrLoadDataJobNotFound.GenWithStackByArgs(j.ID)
	}
	jobRow := jobRows[0]

	switch expected := jobRow.GetEnum(0).String(); expected {
	case "running", "paused":
		// cancelable as far as expected_status is concerned
	default:
		return exeerrors.ErrLoadDataInvalidOperation.GenWithStackByArgs(fmt.Sprintf("need status running or paused, but got %s", expected))
	}

	// A non-NULL end_time means the job has already ended, whatever the
	// expected status says.
	if !jobRow.IsNull(1) {
		if jobRow.IsNull(2) {
			return exeerrors.ErrLoadDataInvalidOperation.GenWithStackByArgs("need status running or paused, but got finished")
		}
		return exeerrors.ErrLoadDataInvalidOperation.GenWithStackByArgs("need status running or paused, but got failed")
	}

	_, err = j.Conn.ExecuteInternal(ctx, cancelSQL, j.ID)
	return err
}
// DropJob deletes the job's row from mysql.load_data_jobs. The row is matched
// by both job_id and create_user, so only a job belonging to j.User is removed.
//
// It returns the executor's error when the DELETE fails, and
// ErrLoadDataJobNotFound when the DELETE succeeds but affects no row.
func (j *Job) DropJob(ctx context.Context) error {
	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	_, err := j.Conn.ExecuteInternal(ctx,
		`DELETE FROM mysql.load_data_jobs
WHERE job_id = %? AND create_user = %?;`,
		j.ID, j.User)
	// Propagate a failed DELETE. The previous `err == nil` check returned early
	// on success, which skipped the affected-rows check below, and fell through
	// on failure, which discarded the real error.
	if err != nil {
		return err
	}
	if j.Conn.GetSessionVars().StmtCtx.AffectedRows() < 1 {
		return exeerrors.ErrLoadDataJobNotFound.GenWithStackByArgs(j.ID)
	}
	return nil
}
// OnComplete writes the final state of the job. With a nil inErr it calls
// FinishJob with msg; otherwise it calls FailJob with a message derived from
// inErr. When the cause of inErr is an *errors.Error that marshals to JSON
// successfully, that JSON is used as the message; in every other case
// inErr.Error() is used.
//
// Failures of FinishJob/FailJob are only logged.
func (j *Job) OnComplete(inErr error, msg string) {
	// Use a fresh background context so the ending status is written even if
	// the user context is canceled.
	bgCtx := kv.WithInternalSourceType(context.Background(), kv.InternalLoadData)

	if inErr != nil {
		failMsg := inErr.Error()
		if typed, isTyped := errors.Cause(inErr).(*errors.Error); isTyped {
			if encoded, encErr := typed.MarshalJSON(); encErr == nil {
				failMsg = string(encoded)
			}
		}
		terror.Log(j.FailJob(bgCtx, failMsg))
		return
	}

	terror.Log(j.FinishJob(bgCtx, msg))
}
// ProgressUpdateRoutineFn is the heartbeat loop of a job. Every HeartBeatInSec
// seconds it writes progress.String() through UpdateJobProgress, until one of
// the following happens:
//
//   - finishCh is signaled: one last progress update is attempted (a failure is
//     only logged as a warning) and nil is returned;
//   - errCh is signaled: nil is returned without any further update;
//   - a periodic update returns an error: that error is returned;
//   - a periodic update reports that the keepalive did not succeed: an error
//     saying the job was interrupted or failed to keepalive is returned.
//
// The loop does not select on ctx.Done() itself; ctx is only passed on to
// UpdateJobProgress and the logger.
func (j *Job) ProgressUpdateRoutineFn(ctx context.Context, finishCh chan struct{}, errCh <-chan struct{}, progress *Progress) error {
	heartbeat := time.NewTicker(time.Duration(HeartBeatInSec) * time.Second)
	defer heartbeat.Stop()

	// report pushes the progress as it is at the moment of the call.
	report := func() (bool, error) {
		return j.UpdateJobProgress(ctx, progress.String())
	}

	for {
		select {
		case <-errCh:
			return nil
		case <-heartbeat.C:
			alive, updErr := report()
			switch {
			case updErr != nil:
				return updErr
			case !alive:
				return errors.Errorf("failed to update job progress, the job %d is interrupted by user or failed to keepalive", j.ID)
			}
		case <-finishCh:
			// When done, try to update progress to reach 100%
			alive, updErr := report()
			if updErr != nil || !alive {
				logutil.Logger(ctx).Warn("failed to update job progress when finished",
					zap.Bool("ok", alive), zap.Error(updErr))
			}
			return nil
		}
	}
}
// JobExpectedStatus is the expected status of a load data job. User can set the
// expected status of a job and worker will respect it.
// UpdateJobExpectedStatus translates these values into the expected_status
// column values 'running', 'paused' and 'canceled'.
type JobExpectedStatus int
const (
// JobExpectedRunning means the job is expected to be running. It is the zero
// value of JobExpectedStatus.
JobExpectedRunning JobExpectedStatus = iota
// JobExpectedPaused means the job is expected to be paused.
JobExpectedPaused
// JobExpectedCanceled means the job is expected to be canceled.
JobExpectedCanceled
)
// UpdateJobExpectedStatus updates the expected_status column of the job with
// the given jobID. Each target status is only applied from a compatible
// current value:
//
//   - JobExpectedRunning:  'paused'  -> 'running'
//   - JobExpectedPaused:   'running' -> 'paused'
//   - JobExpectedCanceled: anything except 'canceled' -> 'canceled'
//
// The row is matched by job_id only; the creating user is not checked. The
// number of affected rows is not inspected, so nil is returned when the
// transition did not apply. An error is returned when status is not one of the
// declared JobExpectedStatus values, or when the statement fails.
// TODO: remove it?
func UpdateJobExpectedStatus(
	ctx context.Context,
	conn sqlexec.SQLExecutor,
	jobID int64,
	status JobExpectedStatus,
) error {
	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	var sql string
	switch status {
	case JobExpectedRunning:
		sql = `UPDATE mysql.load_data_jobs
SET expected_status = 'running'
WHERE job_id = %? AND expected_status = 'paused';`
	case JobExpectedPaused:
		sql = `UPDATE mysql.load_data_jobs
SET expected_status = 'paused'
WHERE job_id = %? AND expected_status = 'running';`
	case JobExpectedCanceled:
		sql = `UPDATE mysql.load_data_jobs
SET expected_status = 'canceled'
WHERE job_id = %? AND expected_status != 'canceled';`
	default:
		// Without this branch an unknown status left sql empty and an empty
		// statement was handed to the executor.
		return fmt.Errorf("unexpected expected status %d for load data job %d", int(status), jobID)
	}
	_, err := conn.ExecuteInternal(ctx, sql, jobID)
	return err
}
// JobStatus is the status of a load data job as reported to users. The values
// are consecutive integers starting at 0 in declaration order.
type JobStatus int

const (
	// JobFailed means the job is failed and can't be resumed.
	JobFailed JobStatus = iota
	// JobCanceled means the job is canceled by user and can't be resumed. It
	// will finally convert to JobFailed with a message indicating the reason
	// is canceled.
	JobCanceled
	// JobPaused means the job is paused by user and can be resumed.
	JobPaused
	// JobFinished means the job is finished.
	JobFinished
	// JobPending means the job is pending to be started.
	JobPending
	// JobRunning means the job is running.
	JobRunning
)

// String returns the lower-case name of the status ("failed", "canceled",
// "paused", "finished", "pending" or "running"). Any value outside the
// declared constants yields "unknown JobStatus".
func (s JobStatus) String() string {
	// Indexed by the status value itself.
	names := [...]string{
		JobFailed:   "failed",
		JobCanceled: "canceled",
		JobPaused:   "paused",
		JobFinished: "finished",
		JobPending:  "pending",
		JobRunning:  "running",
	}
	if s < 0 || int(s) >= len(names) {
		return "unknown JobStatus"
	}
	return names[s]
}
// GetJobStatus reads the job's row by job_id and derives its status together
// with a status message.
//
// A non-nil error is only returned when querying the database goes wrong.
// Business-level problems are reported as JobFailed with a message and a nil
// error; this includes the case where the query does not return exactly one
// row, where the message is the text of ErrLoadDataJobNotFound.
func (j *Job) GetJobStatus(ctx context.Context) (JobStatus, string, error) {
	// The column order must match what getJobStatus expects.
	const statusSQL = `SELECT
expected_status,
update_time >= DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL %? SECOND) AS is_alive,
end_time,
result_message,
error_message,
start_time
FROM mysql.load_data_jobs
WHERE job_id = %?;`

	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	statusRS, err := j.Conn.ExecuteInternal(ctx, statusSQL, OfflineThresholdInSec, j.ID)
	if err != nil {
		return JobFailed, "", err
	}
	defer terror.Call(statusRS.Close)

	found, err := sqlexec.DrainRecordSet(ctx, statusRS, 1)
	switch {
	case err != nil:
		return JobFailed, "", err
	case len(found) != 1:
		notFound := exeerrors.ErrLoadDataJobNotFound.GenWithStackByArgs(j.ID)
		return JobFailed, notFound.Error(), nil
	}
	return getJobStatus(found[0])
}
// getJobStatus derives the status and status message of a job from a row whose
// first 6 columns are (expected_status, is_alive (derived from update_time),
// end_time, result_message, error_message, start_time).
//
// A job with a non-NULL end_time is in an ending status, which has the highest
// priority: finished when result_message is set, otherwise canceled or failed
// depending on expected_status, with error_message as the message. Otherwise
// the status follows expected_status, start_time and is_alive. The returned
// error is always nil.
func getJobStatus(row chunk.Row) (JobStatus, string, error) {
	const (
		colExpectedStatus = iota
		colIsAlive
		colEndTime
		colResultMessage
		colErrorMessage
		colStartTime
	)

	expected := row.GetEnum(colExpectedStatus).String()

	// ending status has the highest priority
	if !row.IsNull(colEndTime) {
		if row.IsNull(colResultMessage) {
			reason := row.GetString(colErrorMessage)
			if expected == "canceled" {
				return JobCanceled, reason, nil
			}
			return JobFailed, reason, nil
		}
		return JobFinished, row.GetString(colResultMessage), nil
	}

	alive := row.GetInt64(colIsAlive) == 1
	started := !row.IsNull(colStartTime)

	switch {
	case expected == "canceled":
		return JobCanceled, "", nil
	case expected == "paused" && (!started || alive):
		return JobPaused, "", nil
	case expected == "paused":
		return JobFailed, "job expected paused but the node is timeout", nil
	case expected == "running" && !started:
		return JobPending, "", nil
	case expected == "running" && alive:
		return JobRunning, "", nil
	case expected == "running":
		return JobFailed, "job expected running but the node is timeout", nil
	}
	return JobFailed, fmt.Sprintf("unexpected job status %s", expected), nil
}
// JobInfo is the information of a load data job, as assembled by getJobInfo
// from one row of mysql.load_data_jobs.
type JobInfo struct {
// JobID is read from the job_id column.
JobID int64
// User is read from the create_user column.
User string
// DataSource is read from the data_source column.
DataSource string
// TableSchema is read from the table_schema column.
TableSchema string
// TableName is read from the table_name column.
TableName string
// ImportMode is read from the import_mode column.
ImportMode string
// Progress is read from the progress column.
Progress string
// Status and StatusMessage are the values derived by getJobStatus from the
// same row.
Status JobStatus
StatusMessage string
// CreateTime, StartTime and EndTime are read from the create_time,
// start_time and end_time columns.
// NOTE(review): start_time and end_time are tested for NULL elsewhere in this
// file; what these fields hold for a NULL column is not visible here - confirm.
CreateTime types.Time
StartTime types.Time
EndTime types.Time
}
// GetJobInfo loads all needed information of the job. The row is matched by
// both job_id and create_user, so a job created by another user is treated as
// missing.
//
// It returns the executor's error when the query fails and
// ErrLoadDataJobNotFound when the query does not return exactly one row.
func (j *Job) GetJobInfo(ctx context.Context) (*JobInfo, error) {
	// The column order must match what getJobInfo expects.
	const jobInfoSQL = `SELECT
expected_status,
update_time >= DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL %? SECOND) AS is_alive,
end_time,
result_message,
error_message,
start_time,
job_id,
data_source,
table_schema,
table_name,
import_mode,
progress,
create_user,
create_time
FROM mysql.load_data_jobs
WHERE job_id = %? AND create_user = %?;`

	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	infoRS, err := j.Conn.ExecuteInternal(ctx, jobInfoSQL, OfflineThresholdInSec, j.ID, j.User)
	if err != nil {
		return nil, err
	}
	defer terror.Call(infoRS.Close)

	matched, err := sqlexec.DrainRecordSet(ctx, infoRS, 1)
	if err != nil {
		return nil, err
	}
	if len(matched) == 1 {
		return getJobInfo(matched[0])
	}
	return nil, exeerrors.ErrLoadDataJobNotFound.GenWithStackByArgs(j.ID)
}
// getJobInfo builds a JobInfo from a row whose columns are, in order,
// (expected_status, is_alive (derived from update_time), end_time,
// result_message, error_message, start_time, job_id, data_source,
// table_schema, table_name, import_mode, progress, create_user, create_time).
//
// Status and StatusMessage come from getJobStatus on the same row; an error
// from getJobStatus is returned as is, with a nil JobInfo.
func getJobInfo(row chunk.Row) (*JobInfo, error) {
	status, statusMsg, err := getJobStatus(row)
	if err != nil {
		return nil, err
	}

	info := &JobInfo{
		JobID:         row.GetInt64(6),
		User:          row.GetString(12),
		DataSource:    row.GetString(7),
		TableSchema:   row.GetString(8),
		TableName:     row.GetString(9),
		ImportMode:    row.GetString(10),
		Progress:      row.GetString(11),
		Status:        status,
		StatusMessage: statusMsg,
		CreateTime:    row.GetTime(13),
		StartTime:     row.GetTime(5),
		EndTime:       row.GetTime(2),
	}
	return info, nil
}
// GetAllJobInfo returns the information of every load data job whose
// create_user equals user. The result is an empty, non-nil slice when the user
// has no jobs. An error is returned when the query fails or when a row cannot
// be converted by getJobInfo.
func GetAllJobInfo(
	ctx context.Context,
	conn sqlexec.SQLExecutor,
	user string,
) ([]*JobInfo, error) {
	// This query can match many rows, unlike the single-row lookups in this
	// file that drain with a size of 1, so fetch several rows per chunk.
	// NOTE(review): assumes the last argument of DrainRecordSet is a chunk size
	// and not a row limit - confirm against util/sqlexec.
	const allJobsChunkSize = 8

	ctx = util.WithInternalSourceType(ctx, kv.InternalLoadData)
	// The column order must match what getJobInfo expects.
	rs, err := conn.ExecuteInternal(ctx,
		`SELECT
expected_status,
update_time >= DATE_SUB(CURRENT_TIMESTAMP(6), INTERVAL %? SECOND) AS is_alive,
end_time,
result_message,
error_message,
start_time,
job_id,
data_source,
table_schema,
table_name,
import_mode,
progress,
create_user,
create_time
FROM mysql.load_data_jobs
WHERE create_user = %?;`,
		OfflineThresholdInSec, user)
	if err != nil {
		return nil, err
	}
	defer terror.Call(rs.Close)

	rows, err := sqlexec.DrainRecordSet(ctx, rs, allJobsChunkSize)
	if err != nil {
		return nil, err
	}

	ret := make([]*JobInfo, 0, len(rows))
	for _, row := range rows {
		// A separate name avoids shadowing the outer err.
		jobInfo, convErr := getJobInfo(row)
		if convErr != nil {
			return nil, convErr
		}
		ret = append(ret, jobInfo)
	}
	return ret, nil
}