diff --git a/br/pkg/lightning/common/retry.go b/br/pkg/lightning/common/retry.go index 892ae6ab83..789c4f2d20 100644 --- a/br/pkg/lightning/common/retry.go +++ b/br/pkg/lightning/common/retry.go @@ -105,7 +105,9 @@ func isSingleRetryableError(err error) bool { if nerr.Timeout() { return true } - if syscallErr, ok := goerrors.Unwrap(err).(*os.SyscallError); ok { + // the error might be nested, such as *url.Error -> *net.OpError -> *os.SyscallError + var syscallErr *os.SyscallError + if goerrors.As(nerr, &syscallErr) { return syscallErr.Err == syscall.ECONNREFUSED || syscallErr.Err == syscall.ECONNRESET } return false diff --git a/br/pkg/lightning/common/retry_test.go b/br/pkg/lightning/common/retry_test.go index 939f4bb956..114e500b33 100644 --- a/br/pkg/lightning/common/retry_test.go +++ b/br/pkg/lightning/common/retry_test.go @@ -19,6 +19,7 @@ import ( "fmt" "io" "net" + "net/url" "testing" "github.com/go-sql-driver/mysql" @@ -66,6 +67,9 @@ func TestIsRetryableError(t *testing.T) { _, err := net.Dial("tcp", "localhost:65533") require.Error(t, err) require.True(t, IsRetryableError(err)) + // wrap net.OpErr inside url.Error + urlErr := &url.Error{Op: "post", Err: err} + require.True(t, IsRetryableError(urlErr)) // MySQL Errors require.False(t, IsRetryableError(&mysql.MySQLError{})) diff --git a/br/pkg/pdutil/pd.go b/br/pkg/pdutil/pd.go index 244f274bb7..ea057ce839 100644 --- a/br/pkg/pdutil/pd.go +++ b/br/pkg/pdutil/pd.go @@ -41,7 +41,7 @@ const ( maxMsgSize = int(128 * units.MiB) // pd.ScanRegion may return a large response pauseTimeout = 5 * time.Minute // pd request retry time when connection fail - pdRequestRetryTime = 10 + pdRequestRetryTime = 120 // set max-pending-peer-count to a large value to avoid scatter region failed. maxPendingPeerUnlimited uint64 = math.MaxInt32 ) @@ -157,6 +157,7 @@ func pdRequestWithCode( resp *http.Response ) count := 0 + // the total retry duration: 120*1 = 2min for { req, err = http.NewRequestWithContext(ctx, method, reqURL, body) if err != nil {