From 794002cf45178522018dfd408ef4db1b51effcfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B1=B1=E5=B2=9A?= <36239017+YuJuncen@users.noreply.github.com> Date: Mon, 5 Jun 2023 15:59:41 +0800 Subject: [PATCH] ebs_backup: added retry for exceeding of quota (#44328) close pingcap/tidb#44325 --- br/pkg/aws/BUILD.bazel | 1 + br/pkg/aws/ebs.go | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/br/pkg/aws/BUILD.bazel b/br/pkg/aws/BUILD.bazel index 2930569987..2b70183655 100644 --- a/br/pkg/aws/BUILD.bazel +++ b/br/pkg/aws/BUILD.bazel @@ -10,6 +10,7 @@ go_library( "//br/pkg/glue", "//br/pkg/utils", "@com_github_aws_aws_sdk_go//aws", + "@com_github_aws_aws_sdk_go//aws/awserr", "@com_github_aws_aws_sdk_go//aws/session", "@com_github_aws_aws_sdk_go//service/ec2", "@com_github_aws_aws_sdk_go//service/ec2/ec2iface", diff --git a/br/pkg/aws/ebs.go b/br/pkg/aws/ebs.go index 4e4436cf11..944acfc558 100644 --- a/br/pkg/aws/ebs.go +++ b/br/pkg/aws/ebs.go @@ -10,6 +10,7 @@ import ( "time" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ec2/ec2iface" @@ -28,6 +29,9 @@ const ( AnnTemporaryVolumeID string = "temporary/volume-id" EC2K8SClusterNameKey string = "aws:eks:cluster-name" + pollingPendingSnapshotInterval = 30 * time.Second + errCodeTooManyPendingSnapshots = "PendingSnapshotLimitExceeded" + SourcePvcNameKey string = "source/pvcName" SourceVolumeIdKey string = "source/VolumeId" SourceTikvNameKey string = "source/TikvName" @@ -202,7 +206,7 @@ func (e *EC2Session) CreateSnapshots(backupInfo *config.EBSBasedBRMeta) (map[str createSnapshotInput.SetInstanceSpecification(&instanceSpecification) - resp, err := e.ec2.CreateSnapshots(&createSnapshotInput) + resp, err := e.createSnapshotsWithRetry(context.TODO(), &createSnapshotInput) if err != nil { return errors.Trace(err) } @@ -233,6 +237,24 @@ func (e 
*EC2Session) CreateSnapshots(backupInfo *config.EBSBasedBRMeta) (map[str
 	return snapIDMap, volAZs, nil
 }
 
+// createSnapshotsWithRetry issues an EC2 CreateSnapshots request and repeats
+// it, waiting pollingPendingSnapshotInterval between attempts, for as long as
+// the call fails with an awserr.Error whose code equals
+// errCodeTooManyPendingSnapshots.
+//
+// Any other failure is annotated with the request and returned immediately.
+// The number of attempts is unbounded; the loop ends early only when ctx is
+// done, in which case ctx.Err() is returned instead of sleeping out the rest
+// of the interval.
+func (e *EC2Session) createSnapshotsWithRetry(ctx context.Context, input *ec2.CreateSnapshotsInput) (*ec2.CreateSnapshotsOutput, error) {
+	for {
+		res, err := e.ec2.CreateSnapshotsWithContext(ctx, input)
+		if err == nil {
+			return res, nil
+		}
+		aerr, ok := err.(awserr.Error)
+		if !ok || aerr.Code() != errCodeTooManyPendingSnapshots {
+			return nil, errors.Annotatef(err, "failed to create snapshot for request %s", input)
+		}
+
+		// The instance specification is read only for logging, so a request
+		// without one must not panic here.
+		var (
+			instanceID *string
+			excluded   []*string
+		)
+		if spec := input.InstanceSpecification; spec != nil {
+			instanceID, excluded = spec.InstanceId, spec.ExcludeDataVolumeIds
+		}
+		log.Warn("the pending snapshots exceeds the limit. waiting...",
+			zap.String("instance", aws.StringValue(instanceID)),
+			zap.Strings("volumns", aws.StringValueSlice(excluded)),
+		)
+
+		// Back off before the next attempt, but stop waiting as soon as the
+		// caller cancels: a bare time.Sleep would ignore ctx.
+		timer := time.NewTimer(pollingPendingSnapshotInterval)
+		select {
+		case <-ctx.Done():
+			timer.Stop()
+			return nil, errors.Trace(ctx.Err())
+		case <-timer.C:
+		}
+	}
+}
+
 func (e *EC2Session) extractSnapProgress(str *string) int64 {
 	if str == nil {
 		return 0