// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
package utils
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
"sync"
|
|
|
|
backuppb "github.com/pingcap/kvproto/pkg/brpb"
|
|
"github.com/pingcap/log"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// retryableErrorMsg holds the substrings that mark a storage error message as
// retryable. MessageIsRetryableStorageError lower-cases the incoming message
// before searching for these, so every entry here must be lower-case too.
//
// UNSAFE! TODO: remove and map them to error types
var retryableErrorMsg = []string{
	"server closed",
	"connection refused",
	"connection reset by peer",
	"channel closed",
	"error trying to connect",
	"connection closed before message completed",
	"body write aborted",
	"error during dispatch",
	"put object timeout",
	"timeout after",
	"internalerror",
	"not read from or written to within the timeout period",
	"<code>requesttimeout</code>",
	"<code>invalidpart</code>",
	"end of file before message length reached",
}
|
|
|
|
// Markers of non-retryable errors. Each one is searched for as a substring of
// the lower-cased error message, which is why they are spelled in lower case.
//
// UNSAFE! TODO: remove and map them to error types
const (
	// ioMsg must appear together with notFoundMsg for a "not found" match.
	ioMsg = "io"
	// notFoundMsg must appear together with ioMsg for a "not found" match.
	notFoundMsg = "notfound"
	// permissionDeniedMsg alone identifies a permission-denied error.
	permissionDeniedMsg = "permissiondenied"
	// credentialNotFoundMsg alone identifies a missing-credential error (Azure Blob).
	credentialNotFoundMsg = "credential info not found"
)
|
|
|
|
// Messages reported back to callers as ErrorHandlingResult.Reason.
// contextCancelledMsg doubles as the substring searched for in the lower-cased
// error message, and retryableStorageErrorMsg doubles as the log message.
const (
	unreachableRetryMsg      = "unreachable retry"
	retryOnKvErrorMsg        = "retry on kv error"
	retryOnRegionErrorMsg    = "retry on region error"
	clusterIdMismatchMsg     = "cluster id mismatch"
	unknownErrorMsg          = "unknown error"
	contextCancelledMsg      = "context canceled"
	retryOnUnknownErrorMsg   = "unknown error, retry it for a few times"
	noRetryOnUnknownErrorMsg = "unknown error, retried too many times, give up"
	retryableStorageErrorMsg = "retryable storage error"
)
|
|
|
|
type ErrorHandlingResult struct {
|
|
Strategy ErrorHandlingStrategy
|
|
Reason string
|
|
}
|
|
|
|
// ErrorHandlingStrategy enumerates the actions that can be taken for a backup error.
type ErrorHandlingStrategy int

const (
	// StrategyRetry means the error can be retried, but doing so consumes the
	// backoff attempt quota.
	StrategyRetry ErrorHandlingStrategy = iota
	// StrategyGiveUp means an unrecoverable error happened and BR should exit,
	// for example:
	//  1. permission not valid.
	//  2. data not found.
	//  3. retry too many times
	StrategyGiveUp
	// StrategyUnknown means the error could not be classified from its typed
	// detail alone.
	StrategyUnknown
)
|
|
|
|
// ErrorContext carries the per-scenario state used when deciding whether an
// unknown backup error may still be retried.
type ErrorContext struct {
	// mu guards encounterTimes.
	mu sync.Mutex
	// encounterTimes counts, per store id, how many unknown errors have been
	// seen within this context; the count drives the retry policy.
	encounterTimes map[uint64]int
	// encounterTimesLimitation is the retry limit for unknown errors: once a
	// store's count exceeds it, the verdict turns from Retry into GiveUp.
	encounterTimesLimitation int
	// description labels the scenario; it is attached to log entries.
	description string
}
|
|
|
|
func NewErrorContext(scenario string, limitation int) *ErrorContext {
|
|
return &ErrorContext{
|
|
description: scenario,
|
|
encounterTimes: make(map[uint64]int),
|
|
encounterTimesLimitation: limitation,
|
|
}
|
|
}
|
|
|
|
func NewDefaultContext() *ErrorContext {
|
|
return &ErrorContext{
|
|
description: "default",
|
|
encounterTimes: make(map[uint64]int),
|
|
encounterTimesLimitation: 1,
|
|
}
|
|
}
|
|
|
|
func NewZeroRetryContext(scenario string) *ErrorContext {
|
|
return &ErrorContext{
|
|
description: scenario,
|
|
encounterTimes: make(map[uint64]int),
|
|
encounterTimesLimitation: 0,
|
|
}
|
|
}
|
|
|
|
func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) ErrorHandlingResult {
|
|
if err == nil {
|
|
return ErrorHandlingResult{StrategyRetry, unreachableRetryMsg}
|
|
}
|
|
res := handleBackupProtoError(err)
|
|
// try the best effort handle unknown error based on their error message
|
|
if res.Strategy == StrategyUnknown && len(err.Msg) != 0 {
|
|
return HandleUnknownBackupError(err.Msg, storeId, ec)
|
|
}
|
|
return res
|
|
}
|
|
|
|
func handleBackupProtoError(e *backuppb.Error) ErrorHandlingResult {
|
|
switch e.Detail.(type) {
|
|
case *backuppb.Error_KvError:
|
|
return ErrorHandlingResult{StrategyRetry, retryOnKvErrorMsg}
|
|
case *backuppb.Error_RegionError:
|
|
return ErrorHandlingResult{StrategyRetry, retryOnRegionErrorMsg}
|
|
case *backuppb.Error_ClusterIdError:
|
|
return ErrorHandlingResult{StrategyGiveUp, clusterIdMismatchMsg}
|
|
}
|
|
return ErrorHandlingResult{StrategyUnknown, unknownErrorMsg}
|
|
}
|
|
|
|
// HandleUnknownBackupError UNSAFE! TODO: remove this method and map all the current unknown errors to error types
|
|
func HandleUnknownBackupError(msg string, uuid uint64, ec *ErrorContext) ErrorHandlingResult {
|
|
// UNSAFE! TODO: use meaningful error code instead of unstructured message to find failed to write error.
|
|
logger := log.L().With(zap.String("description", ec.description))
|
|
if messageIsNotFoundStorageError(msg) {
|
|
reason := fmt.Sprintf("File or directory not found on TiKV Node (store id: %v). "+
|
|
"workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid.",
|
|
uuid)
|
|
return ErrorHandlingResult{StrategyGiveUp, reason}
|
|
}
|
|
if messageIsPermissionDeniedStorageError(msg) {
|
|
reason := fmt.Sprintf("I/O permission denied error occurs on TiKV Node(store id: %v). "+
|
|
"workaround: please ensure tikv has permission to read from & write to the storage.",
|
|
uuid)
|
|
return ErrorHandlingResult{StrategyGiveUp, reason}
|
|
}
|
|
if messageIsCredentialNotFoundError(msg) {
|
|
reason := fmt.Sprintf("Credential info not found on TiKV Node (store id: %v). "+
|
|
"workaround: please ensure the credential/access key is correctly configured for the storage.",
|
|
uuid)
|
|
return ErrorHandlingResult{StrategyGiveUp, reason}
|
|
}
|
|
msgLower := strings.ToLower(msg)
|
|
if strings.Contains(msgLower, contextCancelledMsg) {
|
|
return ErrorHandlingResult{StrategyGiveUp, contextCancelledMsg}
|
|
}
|
|
|
|
if MessageIsRetryableStorageError(msg) {
|
|
logger.Warn(retryableStorageErrorMsg, zap.String("error", msg))
|
|
return ErrorHandlingResult{StrategyRetry, retryableStorageErrorMsg}
|
|
}
|
|
|
|
// retry enough on same store
|
|
ec.mu.Lock()
|
|
defer ec.mu.Unlock()
|
|
ec.encounterTimes[uuid]++
|
|
if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation {
|
|
return ErrorHandlingResult{StrategyRetry, retryOnUnknownErrorMsg}
|
|
}
|
|
return ErrorHandlingResult{StrategyGiveUp, noRetryOnUnknownErrorMsg}
|
|
}
|
|
|
|
// messageIsNotFoundStorageError checks whether the message returning from TiKV is "NotFound" storage I/O error
|
|
func messageIsNotFoundStorageError(msg string) bool {
|
|
msgLower := strings.ToLower(msg)
|
|
return strings.Contains(msgLower, ioMsg) && strings.Contains(msgLower, notFoundMsg)
|
|
}
|
|
|
|
// MessageIsPermissionDeniedStorageError checks whether the message returning from TiKV is "PermissionDenied" storage I/O error
|
|
func messageIsPermissionDeniedStorageError(msg string) bool {
|
|
msgLower := strings.ToLower(msg)
|
|
return strings.Contains(msgLower, permissionDeniedMsg)
|
|
}
|
|
|
|
// messageIsCredentialNotFoundError checks whether the message returning from TiKV is credential not found error
|
|
// Currently only supports Azure Blob Storage. AWS S3 credential errors are handled differently by TiKV's internal retry.
|
|
func messageIsCredentialNotFoundError(msg string) bool {
|
|
msgLower := strings.ToLower(msg)
|
|
return strings.Contains(msgLower, credentialNotFoundMsg)
|
|
}
|
|
|
|
// MessageIsRetryableStorageError checks whether the message returning from TiKV is retryable ExternalStorageError.
|
|
func MessageIsRetryableStorageError(msg string) bool {
|
|
msgLower := strings.ToLower(msg)
|
|
// UNSAFE! TODO: Add a error type for retryable connection error.
|
|
for _, errStr := range retryableErrorMsg {
|
|
if strings.Contains(msgLower, errStr) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|