Files
tidb/br/pkg/task/operator/cmd.go

188 lines
5.3 KiB
Go

// Copyright 2023 PingCAP, Inc. Licensed under Apache-2.0.
package operator
import (
"context"
"crypto/tls"
"strings"
"sync"
"time"
"github.com/pingcap/errors"
"github.com/pingcap/log"
"github.com/pingcap/tidb/br/pkg/logutil"
"github.com/pingcap/tidb/br/pkg/pdutil"
"github.com/pingcap/tidb/br/pkg/task"
"github.com/pingcap/tidb/br/pkg/utils"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"
"google.golang.org/grpc/keepalive"
)
// dialPD builds a PD controller for the PD addresses listed in cfg.
//
// When TLS is enabled in cfg, the TLS configuration derived from it is handed
// to the controller; otherwise a nil *tls.Config is passed. Any error from
// building the TLS configuration or from creating the controller is returned
// unchanged, together with a nil controller.
func dialPD(ctx context.Context, cfg *task.Config) (*pdutil.PdController, error) {
	var tlsConf *tls.Config
	if cfg.TLS.IsEnabled() {
		conf, err := cfg.TLS.ToTLSConfig()
		if err != nil {
			return nil, err
		}
		tlsConf = conf
	}

	// PD addresses are passed to the controller as one comma-separated string.
	endpoints := strings.Join(cfg.PD, ",")
	controller, err := pdutil.NewPdController(ctx, endpoints, tlsConf, cfg.TLS.ToPDSecurityOption())
	if err != nil {
		return nil, err
	}
	return controller, nil
}
// cleanUpWith runs the clean-up function f through cleanUpWithErr, for
// clean-up work that has no error to report.
func (cx *AdaptEnvForSnapshotBackupContext) cleanUpWith(f func(ctx context.Context)) {
	infallible := func(ctx context.Context) error {
		f(ctx)
		return nil
	}
	// The adapter above always returns nil, so there is nothing to handle here.
	_ = cx.cleanUpWithErr(infallible)
}
// cleanUpWithErr runs the clean-up function f and returns its result.
//
// f receives a fresh context derived from context.Background() rather than
// from cx, with a timeout of cx.cfg.TTL, so it is not affected by cx being
// canceled. The context is released once f returns.
func (cx *AdaptEnvForSnapshotBackupContext) cleanUpWithErr(f func(ctx context.Context) error) error {
	cleanupCtx, release := context.WithTimeout(context.Background(), cx.cfg.TTL)
	defer release()
	return f(cleanupCtx)
}
// AdaptEnvForSnapshotBackupContext is the state shared by the stages that
// AdaptEnvForSnapshotBackup starts. It embeds a context.Context, so a pointer
// to it can be passed wherever a context is expected.
type AdaptEnvForSnapshotBackupContext struct {
	context.Context

	// pdMgr is the PD controller the stages talk to.
	pdMgr *pdutil.PdController
	// kvMgr is the store manager handed to the importing-related stage.
	kvMgr *utils.StoreManager
	// cfg is a copy of the configuration the run was started with.
	cfg PauseGcConfig

	// rdGrp tracks the stages that have not reported ready yet; ReadyL calls
	// Done on it once per stage.
	rdGrp sync.WaitGroup
	// runGrp is the error group the stages run in.
	runGrp *errgroup.Group
}
// ReadyL logs that the stage called name is ready and then marks one stage as
// done on the readiness wait group.
//
// notes are attached to the log entry as extra fields, followed by a
// "component" field carrying name. The fields are assembled in a fresh slice:
// appending directly to notes could write into the backing array of a slice
// the caller passed with `fields...` when that slice has spare capacity.
//
// It must be called at most once per stage counted in the wait group, since
// each call invokes Done on it.
func (cx *AdaptEnvForSnapshotBackupContext) ReadyL(name string, notes ...zap.Field) {
	fields := make([]zap.Field, 0, len(notes)+1)
	fields = append(fields, notes...)
	fields = append(fields, zap.String("component", name))
	logutil.CL(cx).Info("Stage ready.", fields...)
	cx.rdGrp.Done()
}
// hintAllReady writes the fixed sequence of log lines announcing readiness.
//
// Hack: some versions of operators use the first two messages to check
// whether we are ready, so the messages are emitted exactly as they are.
func hintAllReady() {
	for _, msg := range [...]string{
		"Schedulers are paused.",
		"GC is paused.",
		"All ready.",
	} {
		log.Info(msg)
	}
}
// AdaptEnvForSnapshotBackup adapts the cluster for a snapshot backup as
// described by cfg: it runs the stages that pause GC, pause the PD schedulers
// and pause importing.
//
// The call blocks the current goroutine until every stage has returned. The
// GC and scheduler stages only return once the context is canceled or fails,
// so in the normal case this blocks until ctx is canceled. The returned error
// is the one reported by the error group the stages run in.
func AdaptEnvForSnapshotBackup(ctx context.Context, cfg *PauseGcConfig) error {
	pdCtl, err := dialPD(ctx, &cfg.Config)
	if err != nil {
		return errors.Annotate(err, "failed to dial PD")
	}

	var tlsConf *tls.Config
	if cfg.TLS.IsEnabled() {
		if tlsConf, err = cfg.TLS.ToTLSConfig(); err != nil {
			return errors.Annotate(err, "invalid tls config")
		}
	}

	keepaliveParams := keepalive.ClientParameters{
		Time:    cfg.Config.GRPCKeepaliveTime,
		Timeout: cfg.Config.GRPCKeepaliveTimeout,
	}
	storeMgr := utils.NewStoreManager(pdCtl.GetPDClient(), keepaliveParams, tlsConf)

	group, groupCtx := errgroup.WithContext(ctx)
	// The zero value of the readiness wait group is ready to use, so it is not
	// set explicitly here.
	cx := &AdaptEnvForSnapshotBackupContext{
		Context: logutil.ContextWithField(groupCtx, zap.String("tag", "br_operator")),
		pdMgr:   pdCtl,
		kvMgr:   storeMgr,
		cfg:     *cfg,
		runGrp:  group,
	}

	stages := []func(*AdaptEnvForSnapshotBackupContext) error{
		pauseGCKeeper,
		pauseSchedulerKeeper,
		pauseImporting,
	}
	// Every stage is expected to report readiness once via ReadyL.
	cx.rdGrp.Add(len(stages))
	for _, stage := range stages {
		stage := stage // per-iteration copy for toolchains before Go 1.22
		group.Go(func() error { return stage(cx) })
	}

	// Announce readiness once all stages have reported in.
	// NOTE(review): if a stage fails before calling ReadyL, this goroutine
	// never gets past Wait.
	go func() {
		cx.rdGrp.Wait()
		hintAllReady()
	}()

	return group.Wait()
}
// pauseImporting is the stage that suspends importing on all stores.
//
// It first calls DenyAllStores with the configured TTL; a failure there is
// returned (traced) and the stage never reports ready. On success it reports
// ready as "pause_lightning" and registers a background task on the run group
// that:
//
//  1. runs the keeper until it returns, logging a warning unless the cause of
//     the returned error is context.Canceled;
//  2. then, under a clean-up context bounded by the TTL, calls AllowAllStores
//     until it succeeds, waiting TTL/10 between attempts, and returns the
//     result of ConsistentWithPrev on the successful response.
//
// The wait between attempts is interrupted as soon as the clean-up context is
// done, so the clean-up never overshoots its deadline by a retry interval; in
// that case an error annotated with "cleaning up timed out" is returned.
//
// pauseImporting itself returns nil right after registering the task.
func pauseImporting(cx *AdaptEnvForSnapshotBackupContext) error {
	denyLightning := utils.NewSuspendImporting("prepare_for_snapshot_backup", cx.kvMgr)
	if _, err := denyLightning.DenyAllStores(cx, cx.cfg.TTL); err != nil {
		return errors.Trace(err)
	}
	cx.ReadyL("pause_lightning")
	cx.runGrp.Go(func() error {
		err := denyLightning.Keeper(cx, cx.cfg.TTL)
		if errors.Cause(err) != context.Canceled {
			logutil.CL(cx).Warn("keeper encounters error.", logutil.ShortError(err))
		}
		return cx.cleanUpWithErr(func(ctx context.Context) error {
			// With a clean-up budget of one TTL this allows roughly 10 attempts.
			retryInterval := cx.cfg.TTL / 10
			for {
				if ctx.Err() != nil {
					return errors.Annotate(ctx.Err(), "cleaning up timed out")
				}
				res, err := denyLightning.AllowAllStores(ctx)
				if err != nil {
					logutil.CL(ctx).Warn("Failed to restore lightning, will retry.", logutil.ShortError(err))
					// Wait before retrying, but give up waiting as soon as the
					// clean-up context is done; the check at the top of the
					// loop then reports the timeout.
					timer := time.NewTimer(retryInterval)
					select {
					case <-ctx.Done():
						timer.Stop()
					case <-timer.C:
					}
					continue
				}
				return denyLightning.ConsistentWithPrev(res)
			}
		})
	})
	return nil
}
// pauseGCKeeper is the stage that starts the service safepoint keeper.
//
// The safepoint uses the configured SafePoint as its backup TS; when that is
// zero, the minimal resolved TS fetched from PD is used instead. Once the
// keeper has been started the stage reports ready as "pause_gc" and then
// blocks until the context is done. Errors from fetching the resolved TS or
// from starting the keeper are returned as they are.
func pauseGCKeeper(ctx *AdaptEnvForSnapshotBackupContext) error {
	// Note: should we remove the service safepoint as soon as this exits?
	id := utils.MakeSafePointID()
	ttlSeconds := int64(ctx.cfg.TTL.Seconds())

	backupTS := ctx.cfg.SafePoint
	if backupTS == 0 {
		minResolvedTS, err := ctx.pdMgr.GetMinResolvedTS(ctx)
		if err != nil {
			return err
		}
		logutil.CL(ctx).Info("No service safepoint provided, using the minimal resolved TS.", zap.Uint64("min-resolved-ts", minResolvedTS))
		backupTS = minResolvedTS
	}

	safePoint := utils.BRServiceSafePoint{
		ID:       id,
		TTL:      ttlSeconds,
		BackupTS: backupTS,
	}
	if err := utils.StartServiceSafePointKeeper(ctx, ctx.pdMgr.GetPDClient(), safePoint); err != nil {
		return err
	}
	ctx.ReadyL("pause_gc", zap.Object("safepoint", safePoint))

	// Returning right here would work as well, but a "keeper" that has exited
	// suggests GC has been resumed, so stay blocked until the context is done.
	<-ctx.Done()
	return nil
}
// pauseSchedulerKeeper is the stage that removes all PD schedulers.
//
// If RemoveAllPDSchedulers hands back an undo function, that function is run
// when this stage exits, under a fresh clean-up context, even if the removal
// also returned an error. A failure of the undo is only logged. On success
// the stage reports ready as "pause_scheduler" and blocks until the context
// is done.
func pauseSchedulerKeeper(ctx *AdaptEnvForSnapshotBackupContext) error {
	restore, removeErr := ctx.pdMgr.RemoveAllPDSchedulers(ctx)
	if restore != nil {
		defer func() {
			ctx.cleanUpWith(func(cleanupCtx context.Context) {
				if restoreErr := restore(cleanupCtx); restoreErr != nil {
					log.Warn("failed to restore pd scheduler.", logutil.ShortError(restoreErr))
				}
			})
		}()
	}
	if removeErr != nil {
		return removeErr
	}

	ctx.ReadyL("pause_scheduler")

	// Stay here until the context is canceled, so the deferred clean-up above
	// runs at the right time.
	<-ctx.Done()
	return nil
}