// Copyright 2020 PingCAP, Inc. Licensed under Apache-2.0.

package utils

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/pingcap/errors"
	backuppb "github.com/pingcap/kvproto/pkg/brpb"
	"github.com/pingcap/log"
	tmysql "github.com/pingcap/tidb/pkg/errno"
	"github.com/pingcap/tidb/pkg/parser/terror"
	"github.com/tikv/client-go/v2/tikv"
	"go.uber.org/multierr"
	"go.uber.org/zap"
)

var retryableServerError = []string{
	"server closed",
	"connection refused",
	"connection reset by peer",
	"channel closed",
	"error trying to connect",
	"connection closed before message completed",
	"body write aborted",
	"error during dispatch",
	"put object timeout",
	"internalerror",
	"not read from or written to within the timeout period",
	"requesttimeout",
	"invalidpart",
	"end of file before message length reached",
}

type ErrorResult struct {
	Strategy ErrorStrategy
	Reason   string
}

type ErrorStrategy int

const (
	// This type can be retried, but each retry consumes one of the backoffer's attempts.
	RetryStrategy ErrorStrategy = iota
	// This type means an unrecoverable error and the whole process should exit,
	// for example:
	// 1. permission is not valid.
	// 2. data has not been found.
	// 3. retried too many times.
	GiveUpStrategy
	// This type represents an unknown error.
	UnknownStrategy
)

type ErrorContext struct {
	mu sync.Mutex
	// encounter times for one context on a store
	// we may use this value to determine the retry policy
	encounterTimes map[uint64]int
	// unknown error retry limitation.
	// encountering an unknown error more times than this turns Retry into GiveUp.
	encounterTimesLimitation int
	// whether in backup or restore
	scenario string
}

func NewErrorContext(scenario string, limitation int) *ErrorContext {
	return &ErrorContext{
		scenario:                 scenario,
		encounterTimes:           make(map[uint64]int),
		encounterTimesLimitation: limitation,
	}
}

func NewDefaultContext() *ErrorContext {
	return &ErrorContext{
		scenario:                 "default",
		encounterTimes:           make(map[uint64]int),
		encounterTimesLimitation: 1,
	}
}

func (ec *ErrorContext) HandleError(err *backuppb.Error, uuid uint64) ErrorResult {
	if err == nil {
		return ErrorResult{RetryStrategy, "unreachable retry"}
	}
	res := ec.handleErrorPb(err, uuid)
	// make a best effort to determine a strategy from the error message here
	if res.Strategy == UnknownStrategy && len(err.Msg) != 0 {
		return ec.HandleErrorMsg(err.Msg, uuid)
	}
	return res
}
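
// Illustrative sketch, not part of the original file: one way a caller might feed
// per-store errors into an ErrorContext and act on the returned strategy. The
// function name below is hypothetical; real BR callers plug this into their own
// backup/restore loops and error handling.
func exampleHandleStoreError(storeID uint64, pbErr *backuppb.Error) error {
	ec := NewErrorContext("backup", 3)
	res := ec.HandleError(pbErr, storeID)
	switch res.Strategy {
	case GiveUpStrategy:
		// unrecoverable: surface the reason and stop the whole task.
		return errors.Errorf("give up handling error from store %d: %s", storeID, res.Reason)
	case RetryStrategy:
		// recoverable: a real caller would requeue the work and back off before retrying.
		log.L().Warn("retrying store error",
			zap.Uint64("storeID", storeID), zap.String("reason", res.Reason))
	}
	return nil
}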

func (ec *ErrorContext) HandleIgnorableError(err *backuppb.Error, uuid uint64) ErrorResult {
	if err == nil {
		return ErrorResult{RetryStrategy, "unreachable retry"}
	}
	res := ec.handleIgnorableErrorPb(err, uuid)
	// make a best effort to determine a strategy from the error message here
	if res.Strategy == UnknownStrategy && len(err.Msg) != 0 {
		return ec.HandleErrorMsg(err.Msg, uuid)
	}
	return res
}

func (ec *ErrorContext) HandleErrorMsg(msg string, uuid uint64) ErrorResult {
	// UNSAFE! TODO: use a meaningful error code instead of an unstructured message to detect failed-to-write errors.
	logger := log.L().With(zap.String("scenario", ec.scenario))
	if messageIsNotFoundStorageError(msg) {
		reason := fmt.Sprintf("File or directory not found on TiKV Node (store id: %v). "+
			"workaround: please ensure br and tikv nodes share the same storage and that the br and tikv users have the same uid.",
			uuid)
		return ErrorResult{GiveUpStrategy, reason}
	}
	if messageIsPermissionDeniedStorageError(msg) {
		reason := fmt.Sprintf("I/O permission denied error occurs on TiKV Node (store id: %v). "+
			"workaround: please ensure tikv has permission to read from & write to the storage.",
			uuid)
		return ErrorResult{GiveUpStrategy, reason}
	}
	msgLower := strings.ToLower(msg)
	if strings.Contains(msgLower, "context canceled") {
		return ErrorResult{GiveUpStrategy, "context canceled, give up"}
	}

	if MessageIsRetryableStorageError(msg) {
		logger.Warn("occur storage error", zap.String("error", msg))
		return ErrorResult{RetryStrategy, "retryable error"}
	}
	// retry enough times on the same store before giving up
	ec.mu.Lock()
	defer ec.mu.Unlock()
	ec.encounterTimes[uuid]++
	if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation {
		return ErrorResult{RetryStrategy, "unknown error, retry it a few times"}
	}
	return ErrorResult{GiveUpStrategy, "unknown error and retried too many times, give up"}
}

func (ec *ErrorContext) handleIgnorableErrorPb(e *backuppb.Error, uuid uint64) ErrorResult {
	switch e.Detail.(type) {
	case *backuppb.Error_KvError:
		return ErrorResult{RetryStrategy, "retry outside because the error can be ignored"}
	case *backuppb.Error_RegionError:
		return ErrorResult{RetryStrategy, "retry outside because the error can be ignored"}
	case *backuppb.Error_ClusterIdError:
		return ErrorResult{GiveUpStrategy, "cluster ID mismatch"}
	}
	return ErrorResult{UnknownStrategy, "unreachable code"}
}

func (ec *ErrorContext) handleErrorPb(e *backuppb.Error, uuid uint64) ErrorResult {
	logger := log.L().With(zap.String("scenario", ec.scenario))
	switch v := e.Detail.(type) {
	case *backuppb.Error_KvError:
		// should not meet errors other than KeyLocked.
		return ErrorResult{GiveUpStrategy, "unknown kv error"}

	case *backuppb.Error_RegionError:
		regionErr := v.RegionError
		// Ignore the following errors.
		if !(regionErr.EpochNotMatch != nil ||
			regionErr.NotLeader != nil ||
			regionErr.RegionNotFound != nil ||
			regionErr.ServerIsBusy != nil ||
			regionErr.StaleCommand != nil ||
			regionErr.StoreNotMatch != nil ||
			regionErr.ReadIndexNotReady != nil ||
			regionErr.ProposalInMergingMode != nil) {
			logger.Error("unexpected region error", zap.Reflect("RegionError", regionErr))
			return ErrorResult{GiveUpStrategy, "unknown region error"}
		}
		logger.Warn("occur region error",
			zap.Reflect("RegionError", regionErr),
			zap.Uint64("uuid", uuid))
		return ErrorResult{RetryStrategy, "retryable error"}

	case *backuppb.Error_ClusterIdError:
		logger.Error("occur cluster ID error", zap.Reflect("error", v), zap.Uint64("uuid", uuid))
		return ErrorResult{GiveUpStrategy, "cluster ID mismatch"}
	}
	return ErrorResult{UnknownStrategy, "unreachable code"}
}

// RetryableFunc presents a retryable operation.
type RetryableFunc func() error

// RetryableFuncV2 presents a retryable operation that returns a value.
type RetryableFuncV2[T any] func(context.Context) (T, error)

// Backoffer implements a backoff policy for retrying operations.
type Backoffer interface {
	// NextBackoff returns a duration to wait before retrying again.
	NextBackoff(err error) time.Duration
	// Attempt returns the remaining number of attempts.
	Attempt() int
}

// WithRetry retries a given operation with a backoff policy.
//
// Returns nil if `retryableFunc` succeeded at least once. Otherwise, returns a
// multierr containing all errors encountered.
func WithRetry(
	ctx context.Context,
	retryableFunc RetryableFunc,
	backoffer Backoffer,
) error {
	_, err := WithRetryV2[struct{}](ctx, backoffer, func(ctx context.Context) (struct{}, error) {
		innerErr := retryableFunc()
		return struct{}{}, innerErr
	})
	return err
}

// WithRetryV2 retries a given operation with a backoff policy.
//
// Returns the operation's value if `fn` succeeded at least once. Otherwise, returns a
// multierr containing all errors encountered.
// Compared with `WithRetry`, this function reorders the arguments and supports capturing the return value.
func WithRetryV2[T any](
	ctx context.Context,
	backoffer Backoffer,
	fn RetryableFuncV2[T],
) (T, error) {
	var allErrors error
	for backoffer.Attempt() > 0 {
		res, err := fn(ctx)
		if err == nil {
			return res, nil
		}
		allErrors = multierr.Append(allErrors, err)
		select {
		case <-ctx.Done():
			return *new(T), allErrors
		case <-time.After(backoffer.NextBackoff(err)):
		}
	}
	return *new(T), allErrors // nolint:wrapcheck
}
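
// Illustrative sketch, not part of the original file: a minimal Backoffer with a
// fixed delay and a bounded attempt count, together with a WithRetryV2 call that
// uses it. constSleepBackoffer and exampleFetchWithRetry are hypothetical names
// introduced only for this example; the real package provides richer backoffers elsewhere.
type constSleepBackoffer struct {
	remaining int
	delay     time.Duration
}

// NextBackoff consumes one attempt and always waits the fixed delay.
func (b *constSleepBackoffer) NextBackoff(error) time.Duration {
	b.remaining--
	return b.delay
}

// Attempt returns the remaining number of attempts.
func (b *constSleepBackoffer) Attempt() int { return b.remaining }

// exampleFetchWithRetry retries `fetch` up to 3 times, sleeping one second between
// attempts, and returns either the first successful value or a multierr of all failures.
func exampleFetchWithRetry(ctx context.Context, fetch func(context.Context) (string, error)) (string, error) {
	bo := &constSleepBackoffer{remaining: 3, delay: time.Second}
	return WithRetryV2[string](ctx, bo, fetch)
}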

// MessageIsRetryableStorageError checks whether the message returned from TiKV is a retryable ExternalStorageError.
func MessageIsRetryableStorageError(msg string) bool {
	msgLower := strings.ToLower(msg)
	// UNSAFE! TODO: Add an error type for retryable connection errors.
	for _, errStr := range retryableServerError {
		if strings.Contains(msgLower, errStr) {
			return true
		}
	}
	return false
}

// FallBack2CreateTable reports whether the error is an ErrInvalidDDLJob error,
// in which case the caller should fall back to creating the table directly.
func FallBack2CreateTable(err error) bool {
	switch nerr := errors.Cause(err).(type) {
	case *terror.Error:
		return nerr.Code() == tmysql.ErrInvalidDDLJob
	}
	return false
}

// RetryWithBackoffer is a simple context for a "mixed" retry.
// Some TiDB APIs, say, `ResolveLock`, require a `tikv.Backoffer` as argument.
// But the `tikv.Backoffer` isn't very customizable: it has some predefined configurations and
// we cannot create new ones. So we mix the flavour of `tikv.Backoffer` with our homemade
// backoff strategy. That is what `RetryWithBackoffer` does.
type RetryWithBackoffer struct {
	bo *tikv.Backoffer

	totalBackoff int
	maxBackoff   int
	baseErr      error

	mu          sync.Mutex
	nextBackoff int
}

// AdaptTiKVBackoffer creates an "ad-hoc" backoffer, which wraps a `tikv.Backoffer` and provides some new functions:
// when backing off, we can manually provide a specific sleep duration instead of directly providing a retry.Config,
// which is sealed in "client-go/internal".
func AdaptTiKVBackoffer(ctx context.Context, maxSleepMs int, baseErr error) *RetryWithBackoffer {
	return &RetryWithBackoffer{
		bo:         tikv.NewBackoffer(ctx, maxSleepMs),
		maxBackoff: maxSleepMs,
		baseErr:    baseErr,
	}
}

// NextSleepInMS returns the duration, in ms, that `BackOff` will sleep for in the current state.
func (r *RetryWithBackoffer) NextSleepInMS() int {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.nextBackoff
}

// TotalSleepInMS returns the total slept time in ms.
func (r *RetryWithBackoffer) TotalSleepInMS() int {
	return r.totalBackoff + r.bo.GetTotalSleep()
}

// MaxSleepInMS returns the max sleep time for the retry context in ms.
func (r *RetryWithBackoffer) MaxSleepInMS() int {
	return r.maxBackoff
}

// BackOff executes the back off: sleep for a precalculated backoff time.
// See `RequestBackOff` for more details.
func (r *RetryWithBackoffer) BackOff() error {
	r.mu.Lock()
	nextBo := r.nextBackoff
	r.nextBackoff = 0
	r.mu.Unlock()

	if r.TotalSleepInMS() > r.maxBackoff {
		return errors.Annotatef(r.baseErr, "backoff exceeds the max backoff time %s", time.Duration(r.maxBackoff)*time.Millisecond)
	}
	time.Sleep(time.Duration(nextBo) * time.Millisecond)
	r.totalBackoff += nextBo
	return nil
}

// RequestBackOff registers the intent of backing off for at least n milliseconds.
// That intent will be fulfilled by the next call to `BackOff`.
func (r *RetryWithBackoffer) RequestBackOff(ms int) {
	r.mu.Lock()
	r.nextBackoff = max(r.nextBackoff, ms)
	r.mu.Unlock()
}

// Inner returns the reference to the inner `backoffer`.
func (r *RetryWithBackoffer) Inner() *tikv.Backoffer {
	return r.bo
}
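
// Illustrative sketch, not part of the original file: the intended calling pattern
// for RetryWithBackoffer. Each failed round registers a desired sleep via
// RequestBackOff, and BackOff then sleeps, or fails once the total budget passed to
// AdaptTiKVBackoffer is exhausted. resolveLocks is a hypothetical stand-in for a
// real operation that needs the inner *tikv.Backoffer.
func exampleRetryWithBackoffer(ctx context.Context, resolveLocks func(*tikv.Backoffer) error) error {
	rbo := AdaptTiKVBackoffer(ctx, 60000, errors.New("example: retry budget exhausted"))
	for {
		err := resolveLocks(rbo.Inner())
		if err == nil {
			return nil
		}
		// Ask to sleep at least 500ms before the next round; other code paths may request more.
		rbo.RequestBackOff(500)
		if boErr := rbo.BackOff(); boErr != nil {
			// Total sleep exceeded the budget: give up with the annotated base error.
			return boErr
		}
	}
}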