Skip to content

Commit 3180f76

Browse files
committed
fix exponential backoff overflow
The `backoff` function's wait time calculation completely overflows `time.Duration` on the 55th retry (after approximately 6 hours). This results in zero wait times, leading to the uncontrolled spawning of hundreds of goroutines, which can cause memory exhaustion and an OOM kill on Linux.
1 parent 1c21c37 commit 3180f76

File tree

1 file changed

+13
-10
lines changed

1 file changed

+13
-10
lines changed

services/http.go

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package services
22

33
import (
44
"fmt"
5-
"math"
65
"net/http"
76
"strconv"
87
"time"
@@ -11,15 +10,15 @@ import (
1110
)
1211

1312
const (
14-
// retryWaitSeconds is the base wait time in seconds between retries
15-
retryWaitSeconds = 5 * time.Second
16-
maxRetryWait = 10 * time.Minute
13+
// minRetryWait is the base wait time between retries
14+
minRetryWait = 5 * time.Second
15+
maxRetryWait = 10 * time.Minute
1716
)
1817

1918
// sender is a helper for sending POST requests. If the request fails, sender calculates an
2019
// exponential backoff time using minRetryWait and returns it as the sleep time.
2120
type sender struct {
22-
failCount int
21+
nextWait time.Duration
2322
}
2423

2524
// post posts data to the specified URL and returns the response, the sleep time in seconds, and any
@@ -38,7 +37,7 @@ func (s *sender) post(req *http.Request, httpClient *http.Client) (*http.Respons
3837
return resp, s.backoff(), err
3938
}
4039

41-
s.failCount = 0
40+
s.nextWait = minRetryWait
4241

4342
var sleepTime int64
4443
if sleepVal := resp.Header.Get(common.SleepHeader); sleepVal != "" {
@@ -66,11 +65,15 @@ func (s *sender) doPost(req *http.Request, httpClient *http.Client) (*http.Respo
6665

6766
// backoff calculates the backoff time in seconds for the next retry.
6867
func (s *sender) backoff() int64 {
69-
wait := time.Duration(math.Pow(2, float64(s.failCount))) * retryWaitSeconds
70-
s.failCount++
68+
if s.nextWait < minRetryWait {
69+
s.nextWait = minRetryWait
70+
}
71+
72+
wait := s.nextWait
7173

72-
if wait > maxRetryWait {
73-
return int64(maxRetryWait.Seconds())
74+
s.nextWait *= 2
75+
if s.nextWait > maxRetryWait {
76+
s.nextWait = maxRetryWait
7477
}
7578

7679
return int64(wait.Seconds())

0 commit comments

Comments
 (0)