CCM-16073 - Updated rate limiting behaviour#158
CCM-16073 - Updated rate limiting behaviour#158rhyscoxnhs wants to merge 5 commits intofeature/CCM-16073from
Conversation
a136f1f to
38d0fad
Compare
mjewildnhs
left a comment
There was a problem hiding this comment.
As far as reviewed the admin script
|
|
||
| return { 1, "allowed", 0, effectiveRate } | ||
| local reason = consumedTokens < 1 and "rate_limited" or "allowed" | ||
| local retryAfter = consumedTokens < 1 and 1000 or 0 |
There was a problem hiding this comment.
Do we want to optimise the retry time rather than hardcode to 1s?
We could have it ramp down to 1s based on the period in the recovery.
With the defaults we'll be recovering for 10m with a reduced effectiveRate.
With lower invocationRateLimits (e.g. 10/s) it will take 60s to generate a token.
So there is no point retrying so quickly as there won't be any tokens causing unnecessary spin on the lambda.
Instead we could use the time it takes to generate a token math.ceil(1000 / effectiveRate).
| const [consumedOrFlag, reason, retryAfterMs, effectiveRate] = raw; | ||
|
|
||
| if (allowed === 1) { | ||
| if (reason === "allowed" || reason === "probe") { |
There was a problem hiding this comment.
The reason "probe" isn't returned anymore.
| local isHalfOpen = isOpen and now > switchedAt + cooldownMs | ||
| local isRecovering = (not isOpen) and now < switchedAt + recoveryPeriodMs | ||
|
|
||
| -- Circuit is open and no probe slot is available — reject | ||
| return { 0, "circuit_open", openedUntil - now, 0 } | ||
| end | ||
|
|
||
| -------------------------------------------------------------------------------- | ||
| -- 2. SLIDING WINDOW | ||
| -- | ||
| -- Two windows (current + previous) together approximate a sliding window over | ||
| -- cbWindowPeriodMs. When the current window expires it is promoted to previous | ||
| -- and a fresh current window starts. record-result.lua blends the two windows | ||
| -- using a time-based weight to smooth the error rate across the boundary rather | ||
| -- than resetting it to zero at expiry. | ||
| -- | ||
| -- record-result.lua is responsible for incrementing the counters; this script | ||
| -- is only responsible for rolling the window boundary forward when it expires. | ||
| -------------------------------------------------------------------------------- | ||
| local effectiveRate | ||
|
|
||
| if cbWindowFrom == 0 then | ||
| -- No window exists yet — start one now | ||
| cbWindowFrom = now | ||
| elseif (now - cbWindowFrom) > cbWindowPeriodMs then | ||
| -- Current window has expired — roll it forward | ||
| if (now - cbWindowFrom) > (2 * cbWindowPeriodMs) then | ||
| -- Both current and previous windows are stale: a long quiet period means | ||
| -- old failure counts are no longer relevant to the health of the endpoint. | ||
| cbPrevFailures = 0 | ||
| cbPrevAttempts = 0 | ||
| if isOpen then | ||
| if isHalfOpen then | ||
| effectiveRate = probeRateLimit | ||
| else | ||
| -- Promote current → previous so it can be blended with the new current window | ||
| cbPrevFailures = cbFailures | ||
| cbPrevAttempts = cbAttempts | ||
| return { 0, "circuit_open", (switchedAt + cooldownMs) - now, 0 } | ||
| end | ||
| else | ||
| if isRecovering then | ||
| effectiveRate = targetRateLimit * (now - switchedAt) / recoveryPeriodMs | ||
| else | ||
| effectiveRate = targetRateLimit | ||
| end | ||
| cbFailures = 0 | ||
| cbAttempts = 0 | ||
| cbWindowFrom = now | ||
| end |
There was a problem hiding this comment.
Declaring the isHalfOpen / isRecovering where they are used tidies up and simplifies it.
Further to that I think flipping the isHalfOpen check and labelling it inCooldown makes it simpler, brings the cooldownMs into context and moves away from half open terminology which i think is a bit problematic.
| local isHalfOpen = isOpen and now > switchedAt + cooldownMs | |
| local isRecovering = (not isOpen) and now < switchedAt + recoveryPeriodMs | |
| -- Circuit is open and no probe slot is available — reject | |
| return { 0, "circuit_open", openedUntil - now, 0 } | |
| end | |
| -------------------------------------------------------------------------------- | |
| -- 2. SLIDING WINDOW | |
| -- | |
| -- Two windows (current + previous) together approximate a sliding window over | |
| -- cbWindowPeriodMs. When the current window expires it is promoted to previous | |
| -- and a fresh current window starts. record-result.lua blends the two windows | |
| -- using a time-based weight to smooth the error rate across the boundary rather | |
| -- than resetting it to zero at expiry. | |
| -- | |
| -- record-result.lua is responsible for incrementing the counters; this script | |
| -- is only responsible for rolling the window boundary forward when it expires. | |
| -------------------------------------------------------------------------------- | |
| local effectiveRate | |
| if cbWindowFrom == 0 then | |
| -- No window exists yet — start one now | |
| cbWindowFrom = now | |
| elseif (now - cbWindowFrom) > cbWindowPeriodMs then | |
| -- Current window has expired — roll it forward | |
| if (now - cbWindowFrom) > (2 * cbWindowPeriodMs) then | |
| -- Both current and previous windows are stale: a long quiet period means | |
| -- old failure counts are no longer relevant to the health of the endpoint. | |
| cbPrevFailures = 0 | |
| cbPrevAttempts = 0 | |
| if isOpen then | |
| if isHalfOpen then | |
| effectiveRate = probeRateLimit | |
| else | |
| -- Promote current → previous so it can be blended with the new current window | |
| cbPrevFailures = cbFailures | |
| cbPrevAttempts = cbAttempts | |
| return { 0, "circuit_open", (switchedAt + cooldownMs) - now, 0 } | |
| end | |
| else | |
| if isRecovering then | |
| effectiveRate = targetRateLimit * (now - switchedAt) / recoveryPeriodMs | |
| else | |
| effectiveRate = targetRateLimit | |
| end | |
| cbFailures = 0 | |
| cbAttempts = 0 | |
| cbWindowFrom = now | |
| end | |
| f isOpen then | |
| local inCooldown = now <= switchedAt + cooldownMs | |
| if inCooldown then | |
| return { 0, "circuit_open", (switchedAt + cooldownMs) - now, 0 } | |
| end | |
| effectiveRate = probeRateLimit | |
| else | |
| local isRecovering = now < switchedAt + recoveryPeriodMs | |
| if isRecovering then | |
| effectiveRate = targetRateLimit * (now - switchedAt) / recoveryPeriodMs | |
| else | |
| effectiveRate = targetRateLimit | |
| end | |
| end |
| -- decayPeriodMs. This deliberately slows recovery traffic so a flapping | ||
| -- endpoint is not immediately overwhelmed. | ||
| -- Once decayPeriodMs elapses, openedUntil is cleared and the full rate resumes. | ||
| -- Refill precision: bucketRefilledAt advances by exactly the time required to |
There was a problem hiding this comment.
I hard a hard time understanding what was going on with bucketRefilledAt / generationTime and "preserving fractional" time. I think the comment needs to explain what the problem with using "now" is and how it leads to rounding/leaking/overfilling issues.
| local switchedAt = tonumber(switchedAtRaw or tostring(now)) | ||
| local bucketTokens = tonumber(state[3] or "0") | ||
| local bucketRefilledAt = tonumber(state[4] or "0") | ||
| local needInitSwitchedAt = switchedAtRaw == false or switchedAtRaw == nil |
There was a problem hiding this comment.
Personal preference (is convention) would be isSwitchedAtInitialised and flip the conditional.
90a5096 to
26d8d59
Compare
6259edd to
fe2e5f8
Compare
fe2e5f8 to
9856bc9
Compare
8fb8f8c to
5cb24f9
Compare
e3fb631 to
5cb24f9
Compare
| ); | ||
|
|
||
| if (!gateResult.allowed) { | ||
| const delaySec = Math.ceil(gateResult.retryAfterMs / 1000); |
There was a problem hiding this comment.
Do we need some jitter here to stop thundering heard at the end of cooldown
Description
Context
Type of changes
Checklist
Sensitive Information Declaration
To ensure the utmost confidentiality and protect your and others privacy, we kindly ask you to NOT including PII (Personal Identifiable Information) / PID (Personal Identifiable Data) or any other sensitive data in this PR (Pull Request) and the codebase changes. We will remove any PR that do contain any sensitive information. We really appreciate your cooperation in this matter.