distribution/xfer: fix download fencepost bug

maxDownloadAttempts maps to the daemon configuration flag

    --max-download-attempts int
      Set the max download attempts for each pull (default 5)

and the daemon configuration machinery interprets a value of 0 as "apply
the default value" and not a valid user value (config validation/
normalization bugs notwithstanding). The intention is clearly that this
configuration value should be an upper limit on the number of times the
daemon should try to download a particular layer before giving up. So it
is surprising to have the configuration value interpreted as a _retry_
limit. The daemon will make up to N+1 attempts to download a layer! This
also means users cannot disable retries even if they wanted to.

Fix the fencepost bug so that max attempts really means max attempts,
not max retries. And fix the fencepost bug with the retry-backoff delay
so that the first backoff is 5s, not 10s.

Signed-off-by: Cory Snider <csnider@mirantis.com>
This commit is contained in:
Cory Snider 2023-02-03 13:43:22 -05:00
parent 0c68b655f6
commit 97921915a8
2 changed files with 17 additions and 17 deletions

View file

@ -267,7 +267,7 @@ func (ldm *LayerDownloadManager) makeDownloadFunc(descriptor DownloadDescriptor,
downloadReader io.ReadCloser
size int64
err error
retries int
attempt int = 1
)
defer descriptor.Close()
@ -287,16 +287,16 @@ func (ldm *LayerDownloadManager) makeDownloadFunc(descriptor DownloadDescriptor,
default:
}
retries++
if _, isDNR := err.(DoNotRetry); isDNR || retries > ldm.maxDownloadAttempts {
logrus.Errorf("Download failed after %d attempts: %v", retries, err)
if _, isDNR := err.(DoNotRetry); isDNR || attempt >= ldm.maxDownloadAttempts {
logrus.Errorf("Download failed after %d attempts: %v", attempt, err)
d.err = err
return
}
logrus.Infof("Download failed, retrying (%d/%d): %v", retries, ldm.maxDownloadAttempts, err)
delay := retries * 5
logrus.Infof("Download failed, retrying (%d/%d): %v", attempt, ldm.maxDownloadAttempts, err)
delay := attempt * 5
ticker := time.NewTicker(ldm.waitDuration)
attempt++
selectLoop:
for {

View file

@ -216,7 +216,7 @@ func (d *mockDownloadDescriptor) Download(ctx context.Context, progressOutput pr
if d.retries < d.simulateRetries {
d.retries++
return nil, 0, fmt.Errorf("simulating download attempt %d/%d", d.retries, d.simulateRetries)
return nil, 0, fmt.Errorf("simulating download attempt failure %d/%d", d.retries, d.simulateRetries)
}
return d.mockTarStream(), 0, nil
@ -367,25 +367,25 @@ func TestMaxDownloadAttempts(t *testing.T) {
}{
{
name: "max-attempts=5, succeed at 2nd attempt",
simulateRetries: 2,
simulateRetries: 1,
maxDownloadAttempts: 5,
},
{
name: "max-attempts=5, succeed at 5th attempt",
simulateRetries: 4,
maxDownloadAttempts: 5,
},
{
name: "max-attempts=5, fail at 5th attempt",
simulateRetries: 5,
maxDownloadAttempts: 5,
expectedErr: "simulating download attempt failure 5/5",
},
{
name: "max-attempts=5, fail at 6th attempt",
simulateRetries: 6,
maxDownloadAttempts: 5,
expectedErr: "simulating download attempt 5/6",
},
{
name: "max-attempts=0, fail after 1 attempt",
name: "max-attempts=1, fail after 1 attempt",
simulateRetries: 1,
maxDownloadAttempts: 0,
expectedErr: "simulating download attempt 1/1",
maxDownloadAttempts: 1,
expectedErr: "simulating download attempt failure 1/1",
},
}
for _, tc := range tests {