// Copyright 2022 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package checkpoint

import (
	"bytes"
	"context"
	"crypto/sha256"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"math/rand"
	"strings"
	"sync"
	"time"

	"github.com/pingcap/errors"
	"github.com/pingcap/failpoint"
	backuppb "github.com/pingcap/kvproto/pkg/brpb"
	"github.com/pingcap/log"
	"github.com/pingcap/tidb/br/pkg/logutil"
	"github.com/pingcap/tidb/br/pkg/metautil"
	"github.com/pingcap/tidb/br/pkg/rtree"
	"github.com/pingcap/tidb/br/pkg/storage"
	"github.com/pingcap/tidb/br/pkg/summary"
	"github.com/pingcap/tidb/br/pkg/utils"
	"github.com/tikv/client-go/v2/oracle"
	"go.uber.org/zap"
	"golang.org/x/sync/errgroup"
)

const CheckpointDir = "/checkpoints"

type flushPosition struct {
	CheckpointDataDir     string
	CheckpointChecksumDir string
	CheckpointLockPath    string
}

const MaxChecksumTotalCost float64 = 60.0

const defaultTickDurationForFlush = 30 * time.Second

const defaultTckDurationForChecksum = 5 * time.Second

const defaultTickDurationForLock = 4 * time.Minute

const lockTimeToLive = 5 * time.Minute

type KeyType interface {
	~BackupKeyType | ~RestoreKeyType
}

type RangeType struct {
	*rtree.Range
}

func (r RangeType) IdentKey() []byte {
	return r.StartKey
}

type ValueType interface {
	IdentKey() []byte
}

type CheckpointMessage[K KeyType, V ValueType] struct {
	// start-key of the origin range
	GroupKey K
	Group    []V
}

// A Checkpoint Range File is like this:
//
//   CheckpointData
// +----------------+          RangeGroupData                                        RangeGroup
// | DureTime       |   +--------------------------+  encrypted   +------------------+
// | RangeGroupData-+-->| RangeGroupsEncriptedData-+------------->| GroupKey/TableID |
// | RangeGroupData |   | Checksum                 |              | Range            |
// | ...            |   | CipherIv                 |              | ...              |
// | RangeGroupData |   | Size                     |              | Range            |
// +----------------+   +--------------------------+              +------------------+
//
// For restore, since there is no group key, there is only one RangeGroupData
// with multiple ranges in the CheckpointData.
type RangeGroup[K KeyType, V ValueType] struct {
	GroupKey K   `json:"group-key"`
	Group    []V `json:"groups"`
}

type RangeGroupData struct {
	RangeGroupsEncriptedData []byte
	Checksum                 []byte
	CipherIv                 []byte
	Size                     int
}

type CheckpointData struct {
	DureTime        time.Duration     `json:"dure-time"`
	RangeGroupMetas []*RangeGroupData `json:"range-group-metas"`
}

// A Checkpoint Checksum File is like this:
//
//  ChecksumInfo     ChecksumItems             ChecksumItem
// +------------+   +--------------+        +--------------+
// | Content----+-->| ChecksumItem-+------->| TableID      |
// | Checksum   |   | ChecksumItem |        | Crc64xor     |
// | DureTime   |   | ...          |        | TotalKvs     |
// +------------+   | ChecksumItem |        | TotalBytes   |
//                  +--------------+        +--------------+
type ChecksumItem struct {
	TableID    int64  `json:"table-id"`
	Crc64xor   uint64 `json:"crc64-xor"`
	TotalKvs   uint64 `json:"total-kvs"`
	TotalBytes uint64 `json:"total-bytes"`
}

type ChecksumItems struct {
	Items []*ChecksumItem `json:"checksum-items"`
}
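
// An illustrative sketch of the checkpoint checksum file's JSON layout, pieced
// together from the struct tags here and from doChecksumFlush below; the values
// are made up. Content holds the marshaled ChecksumItems (base64-encoded, since
// it is a []byte), Checksum is the SHA-256 of Content, and DureTime marshals as
// nanoseconds:
//
//	{
//	  "content":   "<base64 of {\"checksum-items\":[{\"table-id\":76,...}]}>",
//	  "checksum":  "<base64 of sha256(content)>",
//	  "dure-time": 30000000000
//	}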
type ChecksumInfo struct {
	Content  []byte        `json:"content"`
	Checksum []byte        `json:"checksum"`
	DureTime time.Duration `json:"dure-time"`
}

type GlobalTimer interface {
	GetTS(context.Context) (int64, int64, error)
}

type CheckpointRunner[K KeyType, V ValueType] struct {
	flushPosition
	lockId         uint64
	meta           map[K]*RangeGroup[K, V]
	checksum       ChecksumItems
	valueMarshaler func(*RangeGroup[K, V]) ([]byte, error)
	storage        storage.ExternalStorage
	cipher         *backuppb.CipherInfo
	timer          GlobalTimer
	appendCh       chan *CheckpointMessage[K, V]
	checksumCh     chan *ChecksumItem
	doneCh         chan bool
	metaCh         chan map[K]*RangeGroup[K, V]
	checksumMetaCh chan ChecksumItems
	lockCh         chan struct{}
	errCh          chan error
	err            error
	errLock        sync.RWMutex
	wg             sync.WaitGroup
}
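
// newCheckpointRunner builds a runner that buffers appended range groups and
// checksum items in memory; flushing them to the external storage is driven by
// the tickers in startCheckpointMainLoop below.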
func newCheckpointRunner[K KeyType, V ValueType](
	ctx context.Context,
	storage storage.ExternalStorage,
	cipher *backuppb.CipherInfo,
	timer GlobalTimer,
	f flushPosition,
	vm func(*RangeGroup[K, V]) ([]byte, error),
) *CheckpointRunner[K, V] {
	return &CheckpointRunner[K, V]{
		flushPosition:  f,
		meta:           make(map[K]*RangeGroup[K, V]),
		checksum:       ChecksumItems{Items: make([]*ChecksumItem, 0)},
		valueMarshaler: vm,
		storage:        storage,
		cipher:         cipher,
		timer:          timer,
		appendCh:       make(chan *CheckpointMessage[K, V]),
		checksumCh:     make(chan *ChecksumItem),
		doneCh:         make(chan bool, 1),
		metaCh:         make(chan map[K]*RangeGroup[K, V]),
		checksumMetaCh: make(chan ChecksumItems),
		lockCh:         make(chan struct{}),
		errCh:          make(chan error, 1),
		err:            nil,
	}
}
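
// FlushChecksum wraps a table's checksum result into a ChecksumItem and hands
// it to the checkpoint loop via FlushChecksumItem. A typical call site
// (illustrative only; the real callers live outside this file) looks like:
//
//	if err := runner.FlushChecksum(ctx, tableID, crc64xor, totalKvs, totalBytes); err != nil {
//		return errors.Trace(err)
//	}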
func (r *CheckpointRunner[K, V]) FlushChecksum(
	ctx context.Context,
	tableID int64,
	crc64xor uint64,
	totalKvs uint64,
	totalBytes uint64,
) error {
	checksumItem := &ChecksumItem{
		TableID:    tableID,
		Crc64xor:   crc64xor,
		TotalKvs:   totalKvs,
		TotalBytes: totalBytes,
	}
	return r.FlushChecksumItem(ctx, checksumItem)
}

func (r *CheckpointRunner[K, V]) FlushChecksumItem(
	ctx context.Context,
	checksumItem *ChecksumItem,
) error {
	select {
	case <-ctx.Done():
		return errors.Annotatef(ctx.Err(), "failed to append checkpoint checksum item")
	case err, ok := <-r.errCh:
		if !ok {
			r.errLock.RLock()
			err = r.err
			r.errLock.RUnlock()
			return errors.Annotate(err, "[checkpoint] Checksum: failed to append checkpoint checksum item")
		}
		return err
	case r.checksumCh <- checksumItem:
		return nil
	}
}
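
// Append queues a group of ranges for the next checkpoint flush. It blocks
// until the checkpoint main loop accepts the message, the context is canceled,
// or the runner has already failed.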
func (r *CheckpointRunner[K, V]) Append(
	ctx context.Context,
	message *CheckpointMessage[K, V],
) error {
	select {
	case <-ctx.Done():
		return errors.Annotatef(ctx.Err(), "failed to append checkpoint message")
	case err, ok := <-r.errCh:
		if !ok {
			r.errLock.RLock()
			err = r.err
			r.errLock.RUnlock()
			return errors.Annotate(err, "[checkpoint] Append: failed to append checkpoint message")
		}
		return err
	case r.appendCh <- message:
		return nil
	}
}

// Note: WaitForFinish must not be called in parallel with the `Append` function.
func (r *CheckpointRunner[K, V]) WaitForFinish(ctx context.Context, flush bool) {
	if r.doneCh != nil {
		select {
		case r.doneCh <- flush:
		default:
			log.Warn("not the first time closing the checkpoint runner", zap.String("category", "checkpoint"))
		}
	}
	// wait for the range flusher to exit
	r.wg.Wait()
	// remove the checkpoint lock
	if r.lockId > 0 {
		err := r.storage.DeleteFile(ctx, r.CheckpointLockPath)
		if err != nil {
			log.Warn("failed to remove the checkpoint lock", zap.Error(err))
		}
	}
}
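
// A rough sketch of a runner's lifecycle, pieced together from the methods in
// this file. The exported helpers that actually wire a runner up for backup or
// restore live elsewhere in this package, so treat this as illustrative rather
// than the canonical call sequence (s, cipher, timer, pos and vm stand for the
// external storage, cipher info, TSO source, flush paths and marshal function):
//
//	runner := newCheckpointRunner[BackupKeyType, RangeType](ctx, s, cipher, timer, pos, vm)
//	if err := runner.initialLock(ctx); err != nil {
//		return errors.Trace(err)
//	}
//	runner.startCheckpointMainLoop(ctx,
//		defaultTickDurationForFlush, defaultTckDurationForChecksum, defaultTickDurationForLock)
//	// ... workers call runner.Append(ctx, msg) and runner.FlushChecksum(ctx, ...) ...
//	runner.WaitForFinish(ctx, true)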

// Send the checksum to the flush goroutine, and reset the CheckpointRunner's checksum
func (r *CheckpointRunner[K, V]) flushChecksum(ctx context.Context, errCh chan error) error {
	checksum := ChecksumItems{
		Items: r.checksum.Items,
	}
	r.checksum.Items = make([]*ChecksumItem, 0)
	// do flush
	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-errCh:
		return err
	case r.checksumMetaCh <- checksum:
	}
	return nil
}

// Send the meta to the flush goroutine, and reset the CheckpointRunner's meta
func (r *CheckpointRunner[K, V]) flushMeta(ctx context.Context, errCh chan error) error {
	meta := r.meta
	r.meta = make(map[K]*RangeGroup[K, V])
	// do flush
	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-errCh:
		return err
	case r.metaCh <- meta:
	}
	return nil
}

func (r *CheckpointRunner[K, V]) setLock(ctx context.Context, errCh chan error) error {
	select {
	case <-ctx.Done():
		return ctx.Err()
	case err := <-errCh:
		return err
	case r.lockCh <- struct{}{}:
	}
	return nil
}

// start a goroutine to flush the metas, which are sent from the checkpoint main loop, to the external storage
func (r *CheckpointRunner[K, V]) startCheckpointFlushLoop(ctx context.Context, wg *sync.WaitGroup) chan error {
	errCh := make(chan error, 1)
	wg.Add(1)
	flushWorker := func(ctx context.Context, errCh chan error) {
		defer wg.Done()
		for {
			select {
			case <-ctx.Done():
				if err := ctx.Err(); err != nil {
					errCh <- err
				}
				return
			case meta, ok := <-r.metaCh:
				if !ok {
					log.Info("stop checkpoint flush worker")
					return
				}
				if err := r.doFlush(ctx, meta); err != nil {
					errCh <- errors.Annotate(err, "failed to flush checkpoint data.")
					return
				}
			case checksums, ok := <-r.checksumMetaCh:
				if !ok {
					log.Info("stop checkpoint flush worker")
					return
				}
				if err := r.doChecksumFlush(ctx, checksums); err != nil {
					errCh <- errors.Annotate(err, "failed to flush checkpoint checksum.")
					return
				}
			case _, ok := <-r.lockCh:
				if !ok {
					log.Info("stop checkpoint flush worker")
					return
				}
				if err := r.updateLock(ctx); err != nil {
					errCh <- errors.Annotate(err, "failed to update checkpoint lock.")
					return
				}
			}
		}
	}
	go flushWorker(ctx, errCh)
	return errCh
}

func (r *CheckpointRunner[K, V]) sendError(err error) {
	select {
	case r.errCh <- err:
		log.Error("send the error", zap.String("category", "checkpoint"), zap.Error(err))
		r.errLock.Lock()
		r.err = err
		r.errLock.Unlock()
		close(r.errCh)
	default:
		log.Error("errCh is blocked", logutil.ShortError(err))
	}
}
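
// startCheckpointMainLoop starts the main loop of the checkpoint runner: it
// collects appended range groups and checksum items, and flushes them (and
// refreshes the lock, when a lock ticker is configured) on their respective
// tick durations until WaitForFinish signals it to stop.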
func (r *CheckpointRunner[K, V]) startCheckpointMainLoop(
	ctx context.Context,
	tickDurationForFlush,
	tickDurationForChecksum,
	tickDurationForLock time.Duration,
) {
	failpoint.Inject("checkpoint-more-quickly-flush", func(_ failpoint.Value) {
		tickDurationForChecksum = 1 * time.Second
		tickDurationForFlush = 3 * time.Second
		if tickDurationForLock > 0 {
			tickDurationForLock = 1 * time.Second
		}
		log.Info("adjust the tick duration for flush or lock",
			zap.Duration("flush", tickDurationForFlush),
			zap.Duration("checksum", tickDurationForChecksum),
			zap.Duration("lock", tickDurationForLock),
		)
	})
	r.wg.Add(1)
	checkpointLoop := func(ctx context.Context) {
		defer r.wg.Done()
		cctx, cancel := context.WithCancel(ctx)
		defer cancel()
		var wg sync.WaitGroup
		errCh := r.startCheckpointFlushLoop(cctx, &wg)
		flushTicker := time.NewTicker(tickDurationForFlush)
		defer flushTicker.Stop()
		checksumTicker := time.NewTicker(tickDurationForChecksum)
		defer checksumTicker.Stop()
		// register the time tickers; the lock ticker is optional
		lockTicker := dispatcherTicker(tickDurationForLock)
		defer lockTicker.Stop()
		for {
			select {
			case <-ctx.Done():
				if err := ctx.Err(); err != nil {
					r.sendError(err)
				}
				return
			case <-lockTicker.Ch():
				if err := r.setLock(ctx, errCh); err != nil {
					r.sendError(err)
					return
				}
			case <-checksumTicker.C:
				if err := r.flushChecksum(ctx, errCh); err != nil {
					r.sendError(err)
					return
				}
			case <-flushTicker.C:
				if err := r.flushMeta(ctx, errCh); err != nil {
					r.sendError(err)
					return
				}
			case msg := <-r.appendCh:
				groups, exist := r.meta[msg.GroupKey]
				if !exist {
					groups = &RangeGroup[K, V]{
						GroupKey: msg.GroupKey,
						Group:    make([]V, 0),
					}
					r.meta[msg.GroupKey] = groups
				}
				groups.Group = append(groups.Group, msg.Group...)
			case msg := <-r.checksumCh:
				r.checksum.Items = append(r.checksum.Items, msg)
			case flush := <-r.doneCh:
				log.Info("stop checkpoint runner")
				if flush {
					// NOTE: this is the exit step; don't send errors anymore.
					if err := r.flushMeta(ctx, errCh); err != nil {
						log.Error("failed to flush checkpoint meta", zap.Error(err))
					} else if err := r.flushChecksum(ctx, errCh); err != nil {
						log.Error("failed to flush checkpoint checksum", zap.Error(err))
					}
				}
				// close the channels to the flush worker
				// and wait for it to consume all the metas
				close(r.metaCh)
				close(r.checksumMetaCh)
				close(r.lockCh)
				wg.Wait()
				return
			case err := <-errCh:
				// pass the flush worker's error back
				r.sendError(err)
				return
			}
		}
	}
	go checkpointLoop(ctx)
}

// flush the checksum to the external storage
func (r *CheckpointRunner[K, V]) doChecksumFlush(ctx context.Context, checksumItems ChecksumItems) error {
	if len(checksumItems.Items) == 0 {
		return nil
	}
	content, err := json.Marshal(checksumItems)
	if err != nil {
		return errors.Trace(err)
	}
	checksum := sha256.Sum256(content)
	checksumInfo := &ChecksumInfo{
		Content:  content,
		Checksum: checksum[:],
		DureTime: summary.NowDureTime(),
	}
	data, err := json.Marshal(checksumInfo)
	if err != nil {
		return errors.Trace(err)
	}
	fname := fmt.Sprintf("%s/t%d_and__.cpt", r.CheckpointChecksumDir, checksumItems.Items[0].TableID)
	if err = r.storage.WriteFile(ctx, fname, data); err != nil {
		return errors.Annotatef(err, "failed to write file %s for checkpoint checksum", fname)
	}
	failpoint.Inject("failed-after-checkpoint-flushes-checksum", func(_ failpoint.Value) {
		failpoint.Return(errors.Errorf("failpoint: failed after checkpoint flushes checksum"))
	})
	return nil
}

// flush the meta to the external storage
func (r *CheckpointRunner[K, V]) doFlush(ctx context.Context, meta map[K]*RangeGroup[K, V]) error {
	if len(meta) == 0 {
		return nil
	}
	checkpointData := &CheckpointData{
		DureTime:        summary.NowDureTime(),
		RangeGroupMetas: make([]*RangeGroupData, 0, len(meta)),
	}
	var fname []byte = nil
	for _, group := range meta {
		if len(group.Group) == 0 {
			continue
		}
		// use the first item's group-key and sub-range-key as the filename
		if len(fname) == 0 {
			fname = append([]byte(fmt.Sprint(group.GroupKey, '.', '.')), group.Group[0].IdentKey()...)
		}
		// Flush the metaFile to storage
		content, err := r.valueMarshaler(group)
		if err != nil {
			return errors.Trace(err)
		}
		encryptBuff, iv, err := metautil.Encrypt(content, r.cipher)
		if err != nil {
			return errors.Trace(err)
		}
		checksum := sha256.Sum256(content)
		checkpointData.RangeGroupMetas = append(checkpointData.RangeGroupMetas, &RangeGroupData{
			RangeGroupsEncriptedData: encryptBuff,
			Checksum:                 checksum[:],
			Size:                     len(content),
			CipherIv:                 iv,
		})
	}
	if len(checkpointData.RangeGroupMetas) > 0 {
		data, err := json.Marshal(checkpointData)
		if err != nil {
			return errors.Trace(err)
		}
		checksum := sha256.Sum256(fname)
		checksumEncoded := base64.URLEncoding.EncodeToString(checksum[:])
		path := fmt.Sprintf("%s/%s_%d.cpt", r.CheckpointDataDir, checksumEncoded, rand.Uint64())
		if err := r.storage.WriteFile(ctx, path, data); err != nil {
			return errors.Trace(err)
		}
	}
	failpoint.Inject("failed-after-checkpoint-flushes", func(_ failpoint.Value) {
		failpoint.Return(errors.Errorf("failpoint: failed after checkpoint flushes"))
	})
	return nil
}
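
// CheckpointLock is the JSON content of the checkpoint lock file. ExpireAt is
// the physical TSO timestamp in milliseconds at which the lock expires. An
// illustrative lock file (the values are made up):
//
//	{"lock-id": 441349518583791619, "expire-at": 1688456324000}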
type CheckpointLock struct {
	LockId   uint64 `json:"lock-id"`
	ExpireAt int64  `json:"expire-at"`
}

// get ts with retry
func (r *CheckpointRunner[K, V]) getTS(ctx context.Context) (int64, int64, error) {
	var (
		p     int64 = 0
		l     int64 = 0
		retry int   = 0
	)
	errRetry := utils.WithRetry(ctx, func() error {
		var err error
		p, l, err = r.timer.GetTS(ctx)
		if err != nil {
			retry++
			log.Info("failed to get ts", zap.Int("retry", retry), zap.Error(err))
			return err
		}
		return nil
	}, utils.NewPDReqBackoffer())
	return p, l, errors.Trace(errRetry)
}

// flush the lock to the external storage
func (r *CheckpointRunner[K, V]) flushLock(ctx context.Context, p int64) error {
	lock := &CheckpointLock{
		LockId:   r.lockId,
		ExpireAt: p + lockTimeToLive.Milliseconds(),
	}
	log.Info("start to flush the checkpoint lock", zap.Int64("lock-at", p),
		zap.Int64("expire-at", lock.ExpireAt))
	data, err := json.Marshal(lock)
	if err != nil {
		return errors.Trace(err)
	}
	err = r.storage.WriteFile(ctx, r.CheckpointLockPath, data)
	return errors.Trace(err)
}

// check whether this lock belongs to this BR
func (r *CheckpointRunner[K, V]) checkLockFile(ctx context.Context, now int64) error {
	data, err := r.storage.ReadFile(ctx, r.CheckpointLockPath)
	if err != nil {
		return errors.Trace(err)
	}
	lock := &CheckpointLock{}
	err = json.Unmarshal(data, lock)
	if err != nil {
		return errors.Trace(err)
	}
	if lock.ExpireAt <= now {
		if lock.LockId > r.lockId {
			return errors.Errorf("Another BR(%d), which started after this one(%d), has already set the checkpoint lock. "+
				"Please check whether that BR is still running. If not, you can retry.", lock.LockId, r.lockId)
		}
		if lock.LockId == r.lockId {
			log.Warn("The lock has expired.",
				zap.Int64("expire-at(ms)", lock.ExpireAt), zap.Int64("now(ms)", now))
		}
	} else if lock.LockId != r.lockId {
		return errors.Errorf("The existing lock will expire in %d seconds. "+
			"There may be another BR(%d) running. If not, you can wait for the lock to expire, "+
			"or delete the file `%s%s` manually.",
			(lock.ExpireAt-now)/1000, lock.LockId, strings.TrimRight(r.storage.URI(), "/"), r.CheckpointLockPath)
	}
	return nil
}

// generate a new lock and flush the lock to the external storage
func (r *CheckpointRunner[K, V]) updateLock(ctx context.Context) error {
	p, _, err := r.getTS(ctx)
	if err != nil {
		return errors.Trace(err)
	}
	if err = r.checkLockFile(ctx, p); err != nil {
		return errors.Trace(err)
	}
	if err = r.flushLock(ctx, p); err != nil {
		return errors.Trace(err)
	}
	failpoint.Inject("failed-after-checkpoint-updates-lock", func(_ failpoint.Value) {
		failpoint.Return(errors.Errorf("failpoint: failed after checkpoint updates lock"))
	})
	return nil
}

// Attempt to initialize the lock. The backup needs to stop when there is an unexpired lock.
func (r *CheckpointRunner[K, V]) initialLock(ctx context.Context) error {
	p, l, err := r.getTS(ctx)
	if err != nil {
		return errors.Trace(err)
	}
	r.lockId = oracle.ComposeTS(p, l)
	exist, err := r.storage.FileExists(ctx, r.CheckpointLockPath)
	if err != nil {
		return errors.Trace(err)
	}
	if exist {
		if err := r.checkLockFile(ctx, p); err != nil {
			return errors.Trace(err)
		}
	}
	if err = r.flushLock(ctx, p); err != nil {
		return errors.Trace(err)
	}
	// wait for 3 seconds to check whether the lock file is overwritten by another BR
	time.Sleep(3 * time.Second)
	err = r.checkLockFile(ctx, p)
	return errors.Trace(err)
}

// walk through all the checkpoint range files, retrieve the metadata of the backed-up/restored ranges,
// and return the total time cost of past executions
func walkCheckpointFile[K KeyType, V ValueType](
	ctx context.Context,
	s storage.ExternalStorage,
	cipher *backuppb.CipherInfo,
	subDir string,
	fn func(groupKey K, value V),
) (time.Duration, error) {
	// records the total time cost in the past executions
	var pastDureTime time.Duration = 0
	err := s.WalkDir(ctx, &storage.WalkOption{SubDir: subDir}, func(path string, size int64) error {
		if strings.HasSuffix(path, ".cpt") {
			content, err := s.ReadFile(ctx, path)
			if err != nil {
				return errors.Trace(err)
			}
			checkpointData := &CheckpointData{}
			if err = json.Unmarshal(content, checkpointData); err != nil {
				log.Error("failed to unmarshal the checkpoint data info, skip it", zap.Error(err))
				return nil
			}
			if checkpointData.DureTime > pastDureTime {
				pastDureTime = checkpointData.DureTime
			}
			for _, meta := range checkpointData.RangeGroupMetas {
				decryptContent, err := metautil.Decrypt(meta.RangeGroupsEncriptedData, cipher, meta.CipherIv)
				if err != nil {
					return errors.Trace(err)
				}
				checksum := sha256.Sum256(decryptContent)
				if !bytes.Equal(meta.Checksum, checksum[:]) {
					log.Error("checkpoint checksum info's checksum mismatch, skip it",
						zap.ByteString("expect", meta.Checksum),
						zap.ByteString("got", checksum[:]),
					)
					continue
				}
				group := &RangeGroup[K, V]{}
				if err = json.Unmarshal(decryptContent, group); err != nil {
					return errors.Trace(err)
				}
				for _, g := range group.Group {
					fn(group.GroupKey, g)
				}
			}
		}
		return nil
	})
	return pastDureTime, errors.Trace(err)
}

// load the checkpoint metadata from the external storage and unmarshal it back
func loadCheckpointMeta[T any](ctx context.Context, s storage.ExternalStorage, path string, m *T) error {
	data, err := s.ReadFile(ctx, path)
	if err != nil {
		return errors.Trace(err)
	}
	err = json.Unmarshal(data, m)
	return errors.Trace(err)
}

// walk through all the checkpoint checksum files and retrieve the checksum information
// of the tables whose checksums have already been calculated
func loadCheckpointChecksum(
	ctx context.Context,
	s storage.ExternalStorage,
	subDir string,
) (map[int64]*ChecksumItem, time.Duration, error) {
	var pastDureTime time.Duration = 0
	checkpointChecksum := make(map[int64]*ChecksumItem)
	err := s.WalkDir(ctx, &storage.WalkOption{SubDir: subDir}, func(path string, size int64) error {
		data, err := s.ReadFile(ctx, path)
		if err != nil {
			return errors.Trace(err)
		}
		info := &ChecksumInfo{}
		err = json.Unmarshal(data, info)
		if err != nil {
			log.Error("failed to unmarshal the checkpoint checksum info, skip it", zap.Error(err))
			return nil
		}
		checksum := sha256.Sum256(info.Content)
		if !bytes.Equal(info.Checksum, checksum[:]) {
			log.Error("checkpoint checksum info's checksum mismatch, skip it",
				zap.ByteString("expect", info.Checksum),
				zap.ByteString("got", checksum[:]),
			)
			return nil
		}
		if info.DureTime > pastDureTime {
			pastDureTime = info.DureTime
		}
		items := &ChecksumItems{}
		err = json.Unmarshal(info.Content, items)
		if err != nil {
			return errors.Trace(err)
		}
		for _, c := range items.Items {
			checkpointChecksum[c.TableID] = c
		}
		return nil
	})
	return checkpointChecksum, pastDureTime, errors.Trace(err)
}
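
// saveCheckpointMetadata marshals the metadata into JSON and writes it to the
// given path on the external storage.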
func saveCheckpointMetadata[T any](ctx context.Context, s storage.ExternalStorage, meta *T, path string) error {
	data, err := json.Marshal(meta)
	if err != nil {
		return errors.Trace(err)
	}
	err = s.WriteFile(ctx, path, data)
	return errors.Trace(err)
}
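
// removeCheckpointData collects the .cpt, .meta and .lock files under the
// checkpoint directory and deletes them with a small worker pool, giving up
// once too many deletions have failed.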
func removeCheckpointData(ctx context.Context, s storage.ExternalStorage, subDir string) error {
	var (
		// Generate one file every 30 seconds, so there are only 1200 files in 10 hours.
		removedFileNames = make([]string, 0, 1200)
		removeCnt        int   = 0
		removeSize       int64 = 0
	)
	err := s.WalkDir(ctx, &storage.WalkOption{SubDir: subDir}, func(path string, size int64) error {
		if !strings.HasSuffix(path, ".cpt") && !strings.HasSuffix(path, ".meta") && !strings.HasSuffix(path, ".lock") {
			return nil
		}
		removedFileNames = append(removedFileNames, path)
		removeCnt += 1
		removeSize += size
		return nil
	})
	if err != nil {
		return errors.Trace(err)
	}
	log.Info("start to remove checkpoint data",
		zap.String("checkpoint task", subDir),
		zap.Int("remove-count", removeCnt),
		zap.Int64("remove-size", removeSize),
	)
	maxFailedFilesNum := 16
	failedFilesCount := struct {
		lock  sync.Mutex
		count int
	}{
		count: 0,
	}
	pool := utils.NewWorkerPool(4, "checkpoint remove worker")
	eg, gCtx := errgroup.WithContext(ctx)
	for _, filename := range removedFileNames {
		name := filename
		pool.ApplyOnErrorGroup(eg, func() error {
			if err := s.DeleteFile(gCtx, name); err != nil {
				log.Warn("failed to remove the file", zap.String("filename", name), zap.Error(err))
				failedFilesCount.lock.Lock()
				failedFilesCount.count += 1
				if failedFilesCount.count >= maxFailedFilesNum {
					failedFilesCount.lock.Unlock()
					return errors.Annotate(err, "failed to delete too many files")
				}
				failedFilesCount.lock.Unlock()
			}
			return nil
		})
	}
	if err := eg.Wait(); err != nil {
		return errors.Trace(err)
	}
	log.Info("all the checkpoint data has been removed", zap.String("checkpoint task", subDir))
	return nil
}