From 9856bc97a364c83bbea43f4e0317eda647f80029 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Thu, 23 Apr 2026 10:12:50 +0100 Subject: [PATCH 1/3] CCM-16073 - Updated rate limiting behaviour --- .../callbacks/module_perf_runner_lambda.tf | 52 +- .../src/__tests__/admit-lua.test.ts | 528 +++++++++--------- .../src/__tests__/endpoint-gate.test.ts | 132 +++-- .../src/__tests__/handler.test.ts | 178 ++++-- .../src/__tests__/record-result-lua.test.ts | 458 +++++++-------- lambdas/https-client-lambda/src/handler.ts | 350 ++++++------ .../src/services/admit.lua | 229 ++------ .../src/services/endpoint-gate.ts | 59 +- .../src/services/record-result.lua | 242 ++++---- lambdas/perf-runner-lambda/package.json | 6 +- .../src/__tests__/cloudwatch.test.ts | 377 ++++++++++++- .../src/__tests__/elasticache.test.ts | 74 +++ .../src/__tests__/index.test.ts | 48 ++ .../src/__tests__/purge.test.ts | 116 ++++ .../src/__tests__/runner.test.ts | 320 ++++++++++- .../src/__tests__/webhook-verify.test.ts | 173 ++++++ lambdas/perf-runner-lambda/src/cloudwatch.ts | 120 +++- lambdas/perf-runner-lambda/src/elasticache.ts | 52 ++ lambdas/perf-runner-lambda/src/index.ts | 23 +- lambdas/perf-runner-lambda/src/purge.ts | 40 ++ lambdas/perf-runner-lambda/src/runner.ts | 134 ++++- lambdas/perf-runner-lambda/src/types.ts | 40 ++ .../perf-runner-lambda/src/webhook-verify.ts | 59 ++ pnpm-lock.yaml | 18 +- 24 files changed, 2724 insertions(+), 1104 deletions(-) create mode 100644 lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts create mode 100644 lambdas/perf-runner-lambda/src/__tests__/purge.test.ts create mode 100644 lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts create mode 100644 lambdas/perf-runner-lambda/src/elasticache.ts create mode 100644 lambdas/perf-runner-lambda/src/purge.ts create mode 100644 lambdas/perf-runner-lambda/src/webhook-verify.ts diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf index 424294a8..f3f57981 100644 --- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf +++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf @@ -40,6 +40,15 @@ module "perf_runner_lambda" { INBOUND_QUEUE_URL = module.sqs_inbound_event.sqs_queue_url TRANSFORM_FILTER_LOG_GROUP = module.client_transform_filter_lambda.cloudwatch_log_group_name DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${local.csi}-https-client-" + MOCK_WEBHOOK_LOG_GROUP = var.deploy_mock_clients ? 
module.mock_webhook_lambda[0].cloudwatch_log_group_name : "" + ELASTICACHE_ENDPOINT = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address + ELASTICACHE_CACHE_NAME = aws_elasticache_serverless_cache.delivery_state.name + ELASTICACHE_IAM_USERNAME = "${var.project}-${var.environment}-${var.component}-elasticache-user" + } + + vpc_config = { + subnet_ids = try(local.acct.private_subnets[local.bc_name], []) + security_group_ids = [aws_security_group.https_client_lambda.id] } } @@ -74,6 +83,22 @@ data "aws_iam_policy_document" "perf_runner_lambda" { ] } + statement { + sid = "SQSPurgeQueue" + effect = "Allow" + + actions = [ + "sqs:PurgeQueue", + ] + + resources = [ + module.sqs_inbound_event.sqs_queue_arn, + "${module.sqs_inbound_event.sqs_queue_arn}-dlq", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-dlq-queue", + ] + } + statement { sid = "CloudWatchLogsInsightsQuery" effect = "Allow" @@ -83,10 +108,15 @@ data "aws_iam_policy_document" "perf_runner_lambda" { "logs:StopQuery", ] - resources = [ - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*", - ] + resources = concat( + [ + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*", + ], + var.deploy_mock_clients ? [ + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.mock_webhook_lambda[0].cloudwatch_log_group_name}:*", + ] : [], + ) } statement { @@ -99,4 +129,18 @@ data "aws_iam_policy_document" "perf_runner_lambda" { resources = ["*"] } + + statement { + sid = "ElastiCacheConnect" + effect = "Allow" + + actions = [ + "elasticache:Connect", + ] + + resources = [ + aws_elasticache_serverless_cache.delivery_state.arn, + aws_elasticache_user.delivery_state_iam.arn, + ] + } } diff --git a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts index 6aab4727..f4906cf2 100644 --- a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts @@ -1,32 +1,32 @@ import admitLuaSrc from "services/admit.lua"; import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; -// ARGV: [now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs] -// KEYS: [cbKey, rlKey] -// Returns: [allowed (0|1), reason, retryAfterMs, effectiveRate] +// ARGV: [now, capacity, targetRateLimit, cooldownMs, recoveryPeriodMs, probeRateLimit, targetBatchSize] +// KEYS: [epKey] +// Returns: [consumedTokens, reason, retryAfterMs, effectiveRate] type AdmitArgs = { now: number; capacity: number; - refillPerSec: number; + targetRateLimit: number; cooldownMs: number; - decayPeriodMs: number; - cbWindowPeriodMs: number; - cbProbeIntervalMs: number; + recoveryPeriodMs: number; + probeRateLimit: number; + targetBatchSize: number; }; const defaultArgs: AdmitArgs = { now: 1_000_000, - capacity: 10, - refillPerSec: 10, - cooldownMs: 60_000, - decayPeriodMs: 300_000, - cbWindowPeriodMs: 60_000, - cbProbeIntervalMs: 60_000, + capacity: 2250, + targetRateLimit: 10, + cooldownMs: 120_000, + recoveryPeriodMs: 
600_000, + probeRateLimit: 1 / 60, + targetBatchSize: 1, }; type AdmitResult = { - allowed: number; + consumedTokens: number; reason: string; retryAfterMs: number; effectiveRate: number; @@ -40,20 +40,20 @@ function runAdmit( const merged = { ...defaultArgs, ...args }; const raw = evalLua( admitLuaSrc, - [`cb:${targetId}`, `rl:${targetId}`], + [`ep:${targetId}`], [ merged.now.toString(), merged.capacity.toString(), - merged.refillPerSec.toString(), + merged.targetRateLimit.toString(), merged.cooldownMs.toString(), - merged.decayPeriodMs.toString(), - merged.cbWindowPeriodMs.toString(), - merged.cbProbeIntervalMs.toString(), + merged.recoveryPeriodMs.toString(), + merged.probeRateLimit.toString(), + merged.targetBatchSize.toString(), ], store, ) as [number, string, number, number]; return { - allowed: raw[0], + consumedTokens: raw[0], reason: raw[1], retryAfterMs: raw[2], effectiveRate: raw[3], @@ -62,399 +62,391 @@ function runAdmit( describe("admit.lua", () => { describe("rate limiting", () => { - it("allows the first request with full token bucket", () => { - const store = createRedisStore(); - const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store); - - expect(allowed).toBe(1); - expect(reason).toBe("allowed"); - expect(retryAfterMs).toBe(0); - expect(effectiveRate).toBe(10); - }); - - it("depletes tokens on consecutive calls and rejects when empty", () => { + it("enters recovery ramp-up on a fresh endpoint with no prior state", () => { const store = createRedisStore(); + const now = 1_000_000; - for (let i = 0; i < 10; i++) { - const { allowed } = runAdmit(store); - expect(allowed).toBe(1); - } + const { consumedTokens, effectiveRate, reason } = runAdmit(store, { + now, + targetRateLimit: 10, + }); - const { allowed, reason } = runAdmit(store); - expect(allowed).toBe(0); + expect(consumedTokens).toBe(0); expect(reason).toBe("rate_limited"); + expect(effectiveRate).toBe(0); }); - it("returns retryAfterMs when rate limited", () => { + it("persists switched_at on first contact so recovery ramp progresses", () => { const store = createRedisStore(); + const now = 1_000_000; - for (let i = 0; i < 10; i++) { - runAdmit(store); - } + runAdmit(store, { now, targetRateLimit: 10 }); - const { retryAfterMs } = runAdmit(store); - expect(retryAfterMs).toBe(1000); + const epHash = store.get("ep:t1")!; + expect(epHash.get("switched_at")).toBe(now.toString()); }); - it("reports effective rate when rate limited", () => { + it("ramps up rate on subsequent calls after fresh endpoint initialisation", () => { const store = createRedisStore(); + const now = 1_000_000; + const later = now + 60_000; - for (let i = 0; i < 10; i++) { - runAdmit(store); - } + runAdmit(store, { now, targetRateLimit: 10 }); - const { effectiveRate } = runAdmit(store); - expect(effectiveRate).toBe(10); + const { consumedTokens, reason } = runAdmit(store, { + now: later, + targetRateLimit: 10, + }); + + expect(consumedTokens).toBeGreaterThanOrEqual(1); + expect(reason).toBe("allowed"); }); - it("refills tokens over time", () => { + it("allows a single request when bucket has tokens from refill", () => { const store = createRedisStore(); const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ["switched_at", "0"], + ]), + ); - for (let i = 0; i < 10; i++) { - runAdmit(store, { now }); - } - - const denied = runAdmit(store, { now }); - expect(denied.allowed).toBe(0); + const { consumedTokens, reason, retryAfterMs } = runAdmit(store, { + now, + 
targetRateLimit: 10, + }); - const refilled = runAdmit(store, { now: now + 1000 }); - expect(refilled.allowed).toBe(1); + expect(consumedTokens).toBe(1); + expect(reason).toBe("allowed"); + expect(retryAfterMs).toBe(0); }); - it("caps tokens at capacity", () => { + it("consumes up to targetBatchSize tokens", () => { const store = createRedisStore(); const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "5"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - runAdmit(store, { now, capacity: 5, refillPerSec: 100 }); - - // Advance 10 seconds — would add 1000 tokens without cap - runAdmit(store, { now: now + 10_000, capacity: 5, refillPerSec: 100 }); - - const rlHash = store.get("rl:t1")!; - // Refill capped to capacity (5), then one consumed → 4 - expect(Number(rlHash.get("tokens"))).toBe(4); + const { consumedTokens } = runAdmit(store, { + now, + targetBatchSize: 3, + }); + expect(consumedTokens).toBe(3); }); - it("handles zero refill rate", () => { + it("consumes all available when batch exceeds available tokens", () => { const store = createRedisStore(); + const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "2"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - for (let i = 0; i < 10; i++) { - runAdmit(store, { refillPerSec: 0 }); - } - - const { allowed, reason, retryAfterMs } = runAdmit(store, { - refillPerSec: 0, + const { consumedTokens } = runAdmit(store, { + now, + targetBatchSize: 5, }); - expect(allowed).toBe(0); - expect(reason).toBe("rate_limited"); - expect(retryAfterMs).toBe(1000); + expect(consumedTokens).toBe(2); }); - }); - describe("circuit breaker", () => { - it("rejects when circuit is open", () => { + it("returns rate_limited when no tokens available", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 60_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", now.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - const { allowed, effectiveRate, reason } = runAdmit(store, { now }); - expect(allowed).toBe(0); - expect(reason).toBe("circuit_open"); - expect(effectiveRate).toBe(0); + const { consumedTokens, reason, retryAfterMs } = runAdmit(store, { now }); + expect(consumedTokens).toBe(0); + expect(reason).toBe("rate_limited"); + expect(retryAfterMs).toBe(1000); }); - it("returns retryAfterMs for open circuit", () => { + it("refills tokens over time", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 30_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", now.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - const { retryAfterMs } = runAdmit(store, { now }); - expect(retryAfterMs).toBe(30_000); + const { consumedTokens } = runAdmit(store, { + now: now + 1000, + targetRateLimit: 10, + }); + expect(consumedTokens).toBe(1); }); - it("allows probe when probe interval has elapsed", () => { + it("caps tokens at capacity", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 61_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", 
"0"], + ["switched_at", "0"], ]), ); - const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store, { + const { consumedTokens } = runAdmit(store, { now, - cbProbeIntervalMs: 60_000, + capacity: 5, + targetRateLimit: 100, + targetBatchSize: 10, }); - expect(allowed).toBe(1); - expect(reason).toBe("probe"); - expect(retryAfterMs).toBe(0); - expect(effectiveRate).toBe(0); + expect(consumedTokens).toBe(5); }); - it("updates last_probe_ms after allowing a probe", () => { + it("handles zero refill rate", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 61_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - runAdmit(store, { now, cbProbeIntervalMs: 60_000 }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("last_probe_ms")).toBe(now.toString()); + const { consumedTokens, reason } = runAdmit(store, { + now: now + 10_000, + targetRateLimit: 0, + }); + expect(consumedTokens).toBe(0); + expect(reason).toBe("rate_limited"); }); - it("does not probe when interval has not elapsed", () => { + it("preserves fractional refill time (bucketRefilledAt += generationTime, not now)", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 30_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", (now - 150).toString()], + ["switched_at", "0"], ]), ); - const { allowed, reason } = runAdmit(store, { - now, - cbProbeIntervalMs: 60_000, - }); - expect(allowed).toBe(0); - expect(reason).toBe("circuit_open"); + runAdmit(store, { now, targetRateLimit: 10 }); + + const epHash = store.get("ep:t1")!; + const refilledAt = Number(epHash.get("bucket_refilled_at")); + // 1 token generated at rate 10/s takes 100ms, so refilledAt = (now-150) + 100 = now - 50 + expect(refilledAt).toBe(now - 50); }); + }); - it("does not probe when cbProbeIntervalMs is 0", () => { + describe("circuit breaker states", () => { + it("blocks completely when circuit is open during cooldown", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; + const switchedAt = now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", "0"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], ]), ); - const { allowed, reason } = runAdmit(store, { + const { consumedTokens, reason } = runAdmit(store, { now, - cbProbeIntervalMs: 0, + cooldownMs: 120_000, }); - expect(allowed).toBe(0); + expect(consumedTokens).toBe(0); expect(reason).toBe("circuit_open"); }); - }); - describe("sliding window", () => { - it("initialises cbWindowFrom on first call", () => { + it("does not consume bucket tokens when fully open", () => { const store = createRedisStore(); const now = 1_000_000; - - runAdmit(store, { now }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_window_from")).toBe(now.toString()); - }); - - it("rolls current window to previous when period expires", () => { - const store = createRedisStore(); - const cbWindowPeriodMs = 60_000; - const t0 = 1_000_000; - const t1 = t0 + cbWindowPeriodMs + 1; + const switchedAt 
= now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", t0.toString()], - ["cb_failures", "5"], - ["cb_attempts", "10"], - ["cb_prev_failures", "0"], - ["cb_prev_attempts", "0"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], + ["bucket_refilled_at", now.toString()], ]), ); - runAdmit(store, { now: t1, cbWindowPeriodMs }); + runAdmit(store, { now, cooldownMs: 120_000 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_prev_failures")).toBe("5"); - expect(cbHash.get("cb_prev_attempts")).toBe("10"); - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe(t1.toString()); + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(100); }); - it("clears both windows when gap exceeds two periods", () => { + it("returns retryAfterMs for open circuit", () => { const store = createRedisStore(); - const cbWindowPeriodMs = 60_000; - const t0 = 1_000_000; - const t1 = t0 + 2 * cbWindowPeriodMs + 1; + const now = 1_000_000; + const switchedAt = now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", t0.toString()], - ["cb_failures", "5"], - ["cb_attempts", "10"], - ["cb_prev_failures", "3"], - ["cb_prev_attempts", "7"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], ]), ); - runAdmit(store, { now: t1, cbWindowPeriodMs }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_prev_failures")).toBe("0"); - expect(cbHash.get("cb_prev_attempts")).toBe("0"); - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe(t1.toString()); + const { retryAfterMs } = runAdmit(store, { now, cooldownMs: 120_000 }); + expect(retryAfterMs).toBe(110_000); }); - }); - describe("decay scaling", () => { - it("applies reduced rate during decay period", () => { + it("uses probeRateLimit when half-open (after cooldown)", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const halfwayThrough = closedAt + decayPeriodMs / 2; + const now = 1_000_000; + const switchedAt = now - 130_000; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", (now - 60_000).toString()], + ]), + ); const { effectiveRate } = runAdmit(store, { - now: halfwayThrough, - refillPerSec: 10, - decayPeriodMs, - }); - expect(effectiveRate).toBe(5); - }); - - it("uses full rate after decay period ends", () => { - const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const afterDecay = closedAt + decayPeriodMs + 1; - - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); - - const { allowed, effectiveRate } = runAdmit(store, { - now: afterDecay, - refillPerSec: 10, - decayPeriodMs, + now, + cooldownMs: 120_000, + probeRateLimit: 1 / 60, }); - expect(allowed).toBe(1); - expect(effectiveRate).toBe(10); + expect(effectiveRate).toBeCloseTo(1 / 60, 5); }); - it("clamps minimum effective rate to 1", () => { + it("uses recovery ramp when closed during recovery period", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const veryEarly = closedAt + 1; + const switchedAt = 1_000_000; + const recoveryPeriodMs = 600_000; 
+ const now = switchedAt + recoveryPeriodMs / 2; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ]), + ); const { effectiveRate } = runAdmit(store, { - now: veryEarly, - refillPerSec: 10, - decayPeriodMs, + now, + targetRateLimit: 10, + recoveryPeriodMs, }); - expect(effectiveRate).toBeGreaterThanOrEqual(1); - }); - - it("clears openedUntil when decay period fully elapses", () => { - const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const afterDecay = closedAt + decayPeriodMs + 1; - - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); - - runAdmit(store, { now: afterDecay, decayPeriodMs }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + expect(effectiveRate).toBe(5); }); - it("does not decay when decayPeriodMs is 0", () => { + it("uses full rate when closed and past recovery period", () => { const store = createRedisStore(); - const closedAt = 1_000_000; + const switchedAt = 100_000; + const recoveryPeriodMs = 600_000; + const now = switchedAt + recoveryPeriodMs + 1; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ]), + ); - const { allowed, effectiveRate } = runAdmit(store, { - now: closedAt + 1, - refillPerSec: 10, - decayPeriodMs: 0, + const { effectiveRate } = runAdmit(store, { + now, + targetRateLimit: 10, + recoveryPeriodMs, }); - expect(allowed).toBe(1); expect(effectiveRate).toBe(10); }); }); describe("state persistence", () => { - it("persists token count and last_refill_ms", () => { + it("persists bucket_tokens and bucket_refilled_at", () => { const store = createRedisStore(); - runAdmit(store, { now: 1_000_000, capacity: 5 }); + const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "5"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - const rlHash = store.get("rl:t1")!; - expect(rlHash.get("tokens")).toBeDefined(); - expect(rlHash.get("last_refill_ms")).toBe("1000000"); + runAdmit(store, { now, targetBatchSize: 2 }); + + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(3); }); - it("persists circuit breaker fields", () => { + it("does not write sampling or circuit fields", () => { const store = createRedisStore(); - runAdmit(store, { now: 1_000_000 }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.has("opened_until_ms")).toBe(true); - expect(cbHash.has("cb_window_from")).toBe(true); - expect(cbHash.has("cb_failures")).toBe(true); - expect(cbHash.has("cb_attempts")).toBe(true); - expect(cbHash.has("cb_prev_failures")).toBe(true); - expect(cbHash.has("cb_prev_attempts")).toBe(true); + runAdmit(store, { + now: 10_000, + }); + + const epHash = store.get("ep:t1")!; + expect(epHash.has("cur_attempts")).toBe(false); + expect(epHash.has("cur_failures")).toBe(false); + expect(epHash.has("sample_till")).toBe(false); }); it("isolates state between targets", () => { const store = createRedisStore(); - runAdmit(store, {}, "target-a"); - runAdmit(store, {}, "target-b"); + store.set( + "ep:target-a", + new Map([ + ["bucket_tokens", "5"], + ["bucket_refilled_at", "10000"], + ]), + ); + store.set( + "ep:target-b", + 
new Map([ + ["bucket_tokens", "3"], + ["bucket_refilled_at", "10000"], + ]), + ); + + runAdmit(store, { now: 10_000 }, "target-a"); + runAdmit(store, { now: 10_000 }, "target-b"); - expect(store.has("cb:target-a")).toBe(true); - expect(store.has("cb:target-b")).toBe(true); - expect(store.has("rl:target-a")).toBe(true); - expect(store.has("rl:target-b")).toBe(true); + expect(store.has("ep:target-a")).toBe(true); + expect(store.has("ep:target-b")).toBe(true); }); }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts index efbc6d88..2cc8cc31 100644 --- a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts @@ -11,13 +11,13 @@ const mockDisconnect = jest.fn().mockResolvedValue(undefined); const mockOn = jest.fn(); const defaultConfig: EndpointGateConfig = { - burstCapacity: 10, - cbProbeIntervalMs: 60_000, - decayPeriodMs: 300_000, - cbWindowPeriodMs: 60_000, - cbErrorThreshold: 0.5, - cbMinAttempts: 10, - cbCooldownMs: 60_000, + burstCapacity: 2250, + probeRateLimit: 1 / 60, + recoveryPeriodMs: 600_000, + samplePeriodMs: 300_000, + failureThreshold: 0.3, + minAttempts: 5, + cooldownPeriodMs: 120_000, }; const mockRedis = { @@ -34,12 +34,23 @@ beforeEach(() => { }); describe("admit", () => { - it("returns allowed when tokens available", async () => { - mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]); + it("returns allowed with consumedTokens when tokens available", async () => { + mockSendCommand.mockResolvedValueOnce([5, "allowed", 0, 10]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 5, + defaultConfig, + ); - expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 }); + expect(result).toEqual({ + allowed: true, + consumedTokens: 5, + effectiveRate: 10, + }); expect(mockSendCommand).toHaveBeenCalledWith( expect.arrayContaining(["EVALSHA"]), ); @@ -48,7 +59,14 @@ describe("admit", () => { it("returns rate_limited when tokens exhausted", async () => { mockSendCommand.mockResolvedValueOnce([0, "rate_limited", 1000, 10]); - const result = await admit(mockRedis, "target-1", 10, false, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + false, + 5, + defaultConfig, + ); expect(result).toEqual({ allowed: false, @@ -58,18 +76,17 @@ describe("admit", () => { }); }); - it("returns allowed with probe flag when circuit is open but probe slot is available", async () => { - mockSendCommand.mockResolvedValueOnce([1, "probe", 0, 0]); - - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); - - expect(result).toEqual({ allowed: true, probe: true, effectiveRate: 0 }); - }); - - it("returns circuit_open without probe slot", async () => { + it("returns circuit_open when circuit is fully open", async () => { mockSendCommand.mockResolvedValueOnce([0, "circuit_open", 30_000, 0]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 5, + defaultConfig, + ); expect(result).toEqual({ allowed: false, @@ -84,9 +101,20 @@ describe("admit", () => { .mockRejectedValueOnce(new Error("NOSCRIPT No matching script")) .mockResolvedValueOnce([1, "allowed", 0, 10]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + 
mockRedis, + "target-1", + 10, + true, + 1, + defaultConfig, + ); - expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 }); + expect(result).toEqual({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); expect(mockSendCommand).toHaveBeenCalledTimes(2); expect(mockSendCommand).toHaveBeenNthCalledWith( 1, @@ -98,25 +126,33 @@ describe("admit", () => { ); }); - it("passes cbProbeIntervalMs=0 when circuit breaker is disabled", async () => { + it("passes probeRateLimit=0 when circuit breaker is disabled", async () => { mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]); - await admit(mockRedis, "target-1", 10, false, defaultConfig); + await admit(mockRedis, "target-1", 10, false, 1, defaultConfig); - // EVALSHA layout: [EVALSHA, sha, keyCount, cbKey, rlKey, now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs] const args = mockSendCommand.mock.calls[0]![0] as string[]; - const cbProbeIntervalArg = args[11]; - expect(cbProbeIntervalArg).toBe("0"); + const probeRateArg = args[9]; + expect(probeRateArg).toBe("0"); }); - it("passes cbKey first, rlKey second", async () => { + it("passes single epKey", async () => { mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 5]); - await admit(mockRedis, "my-target", 5, true, defaultConfig); + await admit(mockRedis, "my-target", 5, true, 1, defaultConfig); const args = mockSendCommand.mock.calls[0]![0] as string[]; - expect(args[3]).toBe("cb:{my-target}"); - expect(args[4]).toBe("rl:{my-target}"); + expect(args[3]).toBe("ep:{my-target}"); + }); + + it("passes targetBatchSize as ARGV", async () => { + mockSendCommand.mockResolvedValueOnce([3, "allowed", 0, 10]); + + await admit(mockRedis, "target-1", 10, true, 7, defaultConfig); + + const args = mockSendCommand.mock.calls[0]![0] as string[]; + const batchSizeArg = args[10]; + expect(batchSizeArg).toBe("7"); }); }); @@ -130,6 +166,7 @@ describe("evalScript", () => { "target-1", 10, true, + 1, defaultConfig, ).catch((error: unknown) => error); @@ -149,6 +186,7 @@ describe("evalScript", () => { "target-1", 10, true, + 1, defaultConfig, ).catch((error: unknown) => error); @@ -165,7 +203,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - true, + 5, + 0, defaultConfig, ); @@ -181,7 +220,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - false, + 5, + 5, defaultConfig, ); @@ -194,7 +234,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - false, + 5, + 1, defaultConfig, ); @@ -209,7 +250,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - true, + 1, + 0, defaultConfig, ); @@ -217,12 +259,22 @@ describe("recordResult", () => { expect(mockSendCommand).toHaveBeenCalledTimes(2); }); - it("passes correct cb key for target", async () => { + it("passes correct ep key for target", async () => { + mockSendCommand.mockResolvedValueOnce([1, "closed"]); + + await recordResult(mockRedis, "my-target", 1, 0, defaultConfig); + + const args = mockSendCommand.mock.calls[0]![0] as string[]; + expect(args[3]).toBe("ep:{my-target}"); + }); + + it("passes consumedTokens and processingFailures as ARGV", async () => { mockSendCommand.mockResolvedValueOnce([1, "closed"]); - await recordResult(mockRedis, "my-target", true, defaultConfig); + await recordResult(mockRedis, "target-1", 8, 3, defaultConfig); const args = mockSendCommand.mock.calls[0]![0] as string[]; - 
expect(args[3]).toBe("cb:{my-target}"); + expect(args[5]).toBe("8"); + expect(args[6]).toBe("3"); }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts index 3b8ad521..a2b7e8b4 100644 --- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts @@ -3,7 +3,6 @@ import { DEFAULT_TARGET, makeRecord, } from "__tests__/fixtures/handler-fixtures"; -import { VisibilityManagedError } from "services/visibility-managed-error"; jest.mock("@nhs-notify-client-callbacks/logger", () => ({ logger: { @@ -74,17 +73,20 @@ jest.mock("services/redis-client", () => ({ getRedisClient: (...args: unknown[]) => mockGetRedisClient(...args), })); +jest.mock("services/delivery-observability", () => ({ + recordAdmissionDenied: jest.fn(), + recordCircuitBreakerClosed: jest.fn(), + recordCircuitBreakerOpen: jest.fn(), + recordDeliveryAttempt: jest.fn(), + recordDeliveryDuration: jest.fn(), + recordDeliveryFailure: jest.fn(), + recordDeliveryPermanentFailure: jest.fn(), + recordDeliveryRateLimited: jest.fn(), + recordDeliverySuccess: jest.fn(), + recordRetryWindowExhausted: jest.fn(), +})); + jest.mock("services/delivery-metrics", () => ({ - emitAdmissionDenied: jest.fn(), - emitCircuitBreakerClosed: jest.fn(), - emitCircuitBreakerOpen: jest.fn(), - emitDeliveryAttempt: jest.fn(), - emitDeliveryDuration: jest.fn(), - emitDeliveryFailure: jest.fn(), - emitDeliveryPermanentFailure: jest.fn(), - emitDeliverySuccess: jest.fn(), - emitRateLimited: jest.fn(), - emitRetryWindowExhausted: jest.fn(), flushMetrics: jest.fn().mockResolvedValue(undefined), resetMetrics: jest.fn(), })); @@ -106,12 +108,12 @@ describe("processRecords", () => { mockJitteredBackoff.mockReturnValue(5); mockIsWindowExhausted.mockReturnValue(false); mockHandleRateLimitedRecord.mockRejectedValue( - new VisibilityManagedError("Rate limited — requeue"), + new Error("Rate limited — requeue"), ); mockGetRedisClient.mockResolvedValue({}); mockAdmit.mockResolvedValue({ allowed: true, - probe: false, + consumedTokens: 100, effectiveRate: 10, }); mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); @@ -159,7 +161,7 @@ describe("processRecords", () => { expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); }); - it("returns failure for 429 rate-limited responses", async () => { + it("returns failure for 429 when handleRateLimitedRecord rejects", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", retryAfterHeader: "60", @@ -177,7 +179,7 @@ describe("processRecords", () => { ); }); - it("processes multiple records independently", async () => { + it("processes multiple records in a single target batch", async () => { const record1 = makeRecord({ messageId: "msg-1" }); const record2 = makeRecord({ messageId: "msg-2" }); @@ -191,25 +193,45 @@ describe("processRecords", () => { const failures = await processRecords([record1, record2]); expect(failures).toEqual([{ itemIdentifier: "msg-2" }]); + expect(mockAdmit).toHaveBeenCalledTimes(1); + }); + + it("delivers only admitted records when consumedTokens is less than batch size", async () => { + const record1 = makeRecord({ messageId: "msg-1" }); + const record2 = makeRecord({ messageId: "msg-2" }); + const record3 = makeRecord({ messageId: "msg-3" }); + + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); + + const failures = await processRecords([record1, record2, record3]); + + 
expect(mockDeliverPayload).toHaveBeenCalledTimes(1); + expect(failures).toEqual([ + { itemIdentifier: "msg-2" }, + { itemIdentifier: "msg-3" }, + ]); }); - it("an unexpected error on one record does not prevent subsequent records being processed", async () => { + it("an unexpected delivery error does not prevent other records in the batch", async () => { const record1 = makeRecord({ messageId: "msg-1" }); const record2 = makeRecord({ messageId: "msg-2" }); - mockLoadTargetConfig - .mockRejectedValueOnce(new Error("S3 unavailable")) - .mockResolvedValueOnce(DEFAULT_TARGET); + mockDeliverPayload + .mockRejectedValueOnce(new Error("Connection reset")) + .mockResolvedValueOnce({ outcome: "success" }); const failures = await processRecords([record1, record2]); expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); - expect(mockDeliverPayload).toHaveBeenCalledTimes(1); expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5); }); it("applies jittered backoff cooldown on unexpected errors", async () => { - mockLoadTargetConfig.mockRejectedValue(new Error("Infrastructure error")); + mockDeliverPayload.mockRejectedValue(new Error("Infrastructure error")); const failures = await processRecords([makeRecord()]); @@ -217,7 +239,7 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5); }); - it("does not apply a second visibility change for admission-denied (managed path)", async () => { + it("changes visibility once per record for admission-denied batch", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -230,7 +252,7 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledTimes(1); }); - it("does not apply a second visibility change for transient failure (managed path)", async () => { + it("changes visibility once for transient failure", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -241,13 +263,13 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledTimes(1); }); - it("returns failure when CLIENT_ID is not set", async () => { + it("throws when CLIENT_ID is not set", async () => { const saved = process.env.CLIENT_ID; delete process.env.CLIENT_ID; - const failures = await processRecords([makeRecord()]); - - expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); + await expect(processRecords([makeRecord()])).rejects.toThrow( + "CLIENT_ID is required", + ); process.env.CLIENT_ID = saved; }); @@ -262,7 +284,7 @@ describe("processRecords", () => { expect(mockDeliverPayload).not.toHaveBeenCalled(); }); - it("calls changeVisibility with backoff on 5xx then throws", async () => { + it("calls changeVisibility with backoff on 5xx", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -303,7 +325,7 @@ describe("processRecords", () => { expect(failures).toEqual([]); }); - it("requeues when rate limited by endpoint gate", async () => { + it("requeues all records when rate limited by endpoint gate", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -319,7 +341,7 @@ describe("processRecords", () => { expect(mockDeliverPayload).not.toHaveBeenCalled(); }); - it("requeues when circuit is open", async () => { + it("requeues all records when circuit is open", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "circuit_open", @@ -350,17 +372,23 @@ describe("processRecords", () => { "target-1", 10, false, + 1, 
expect.any(Object), ); expect(mockDeliverPayload).toHaveBeenCalled(); }); - it("calls recordResult(true) on successful delivery when CB enabled", async () => { + it("calls recordResult with batch counts on successful delivery when CB enabled", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, }; mockLoadTargetConfig.mockResolvedValue(targetCb); + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); const failures = await processRecords([makeRecord()]); @@ -368,17 +396,23 @@ describe("processRecords", () => { expect(mockRecordResult).toHaveBeenCalledWith( expect.anything(), "target-1", - true, + 1, + 0, expect.any(Object), ); }); - it("calls recordResult(false) on 5xx before visibility change", async () => { + it("calls recordResult with failure count on 5xx when CB enabled", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, }; mockLoadTargetConfig.mockResolvedValue(targetCb); + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -390,13 +424,14 @@ describe("processRecords", () => { expect(mockRecordResult).toHaveBeenCalledWith( expect.anything(), "target-1", - false, + 1, + 1, expect.any(Object), ); expect(mockChangeVisibility).toHaveBeenCalled(); }); - it("does not call recordResult on rate-limited path", async () => { + it("does not call recordResult on gate admission-denied path", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -409,17 +444,6 @@ describe("processRecords", () => { expect(mockRecordResult).not.toHaveBeenCalled(); }); - it("does not call recordResult on 429 path", async () => { - mockDeliverPayload.mockResolvedValue({ - outcome: "rate_limited", - retryAfterHeader: "60", - }); - - await processRecords([makeRecord()]); - - expect(mockRecordResult).not.toHaveBeenCalled(); - }); - it("does not call recordResult when CB is disabled on transient failure", async () => { const targetNoCb = { ...DEFAULT_TARGET, @@ -449,7 +473,7 @@ describe("processRecords", () => { expect(mockRecordResult).not.toHaveBeenCalled(); }); - it("emits CircuitBreakerOpen metric when recordResult returns opened", async () => { + it("records CircuitBreakerOpen when recordResult returns opened", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -461,16 +485,16 @@ describe("processRecords", () => { }); mockRecordResult.mockResolvedValue({ ok: false, state: "opened" }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).toHaveBeenCalledWith("target-1"); + expect(recordCircuitBreakerOpen).toHaveBeenCalledWith("target-1"); }); - it("does not emit CircuitBreakerOpen when recordResult returns failed", async () => { + it("does not record CircuitBreakerOpen when recordResult returns failed", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -482,16 +506,16 @@ describe("processRecords", () => { }); mockRecordResult.mockResolvedValue({ ok: false, state: "failed" }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = 
jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).not.toHaveBeenCalled(); + expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("does not emit CircuitBreakerOpen when recordResult returns closed", async () => { + it("does not record CircuitBreakerOpen when recordResult returns closed", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -503,26 +527,32 @@ describe("processRecords", () => { }); mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).not.toHaveBeenCalled(); + expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("emits RateLimited metric on 429 response", async () => { + it("records RateLimited on 429 response", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", retryAfterHeader: "60", }); - const { emitRateLimited } = jest.requireMock("services/delivery-metrics"); + const { recordDeliveryRateLimited } = jest.requireMock( + "services/delivery-observability", + ); await processRecords([makeRecord()]); - expect(emitRateLimited).toHaveBeenCalledWith("target-1"); + expect(recordDeliveryRateLimited).toHaveBeenCalledWith( + "client-1", + "target-1", + "test-message-id", + ); }); it("uses configured maxRetryDurationSeconds when set on target", async () => { @@ -558,4 +588,30 @@ describe("processRecords", () => { 7_200_000, ); }); + + it("groups records by target and processes each batch separately", async () => { + const record1 = makeRecord({ messageId: "msg-1" }); + const record2 = makeRecord({ + messageId: "msg-2", + body: JSON.stringify({ + payload: { + data: [ + { + type: "MessageStatus", + attributes: { messageStatus: "delivered" }, + }, + ], + }, + subscriptionId: "sub-2", + targetId: "target-2", + }), + }); + + const failures = await processRecords([record1, record2]); + + expect(failures).toEqual([]); + expect(mockAdmit).toHaveBeenCalledTimes(2); + expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-1"); + expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-2"); + }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts index 515f1377..5cc407fe 100644 --- a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts @@ -1,28 +1,30 @@ import recordResultLuaSrc from "services/record-result.lua"; import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; -// ARGV: [now, success, cooldownMs, decayPeriodMs, cbErrorThreshold, cbMinAttempts, cbWindowPeriodMs] -// KEYS: [cbKey] +// ARGV: [now, consumedTokens, processingFailures, cooldownPeriodMs, recoveryPeriodMs, failureThreshold, minAttempts, samplePeriodMs] +// KEYS: [epKey] // Returns: [ok (0|1), state] state: "closed" | "opened" | "failed" type RecordResultArgs = { now: number; - success: boolean; - cooldownMs: number; - decayPeriodMs: number; - cbErrorThreshold: number; - cbMinAttempts: number; - cbWindowPeriodMs: number; + consumedTokens: number; + processingFailures: number; + cooldownPeriodMs: number; + recoveryPeriodMs: number; + 
failureThreshold: number; + minAttempts: number; + samplePeriodMs: number; }; const defaultArgs: RecordResultArgs = { now: 1_000_000, - success: true, - cooldownMs: 60_000, - decayPeriodMs: 300_000, - cbErrorThreshold: 0.5, - cbMinAttempts: 10, - cbWindowPeriodMs: 60_000, + consumedTokens: 1, + processingFailures: 0, + cooldownPeriodMs: 120_000, + recoveryPeriodMs: 600_000, + failureThreshold: 0.3, + minAttempts: 5, + samplePeriodMs: 300_000, }; type RecordResultResult = [number, string]; @@ -35,15 +37,16 @@ function runRecordResult( const merged = { ...defaultArgs, ...args }; return evalLua( recordResultLuaSrc, - [`cb:${targetId}`], + [`ep:${targetId}`], [ merged.now.toString(), - merged.success ? "1" : "0", - merged.cooldownMs.toString(), - merged.decayPeriodMs.toString(), - merged.cbErrorThreshold.toString(), - merged.cbMinAttempts.toString(), - merged.cbWindowPeriodMs.toString(), + merged.consumedTokens.toString(), + merged.processingFailures.toString(), + merged.cooldownPeriodMs.toString(), + merged.recoveryPeriodMs.toString(), + merged.failureThreshold.toString(), + merged.minAttempts.toString(), + merged.samplePeriodMs.toString(), ], store, ) as RecordResultResult; @@ -51,79 +54,122 @@ function runRecordResult( describe("record-result.lua", () => { describe("success recording", () => { - it("returns closed state for a successful result", () => { + it("returns closed state for a successful batch", () => { const store = createRedisStore(); - const [ok, state] = runRecordResult(store, { success: true }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + const [ok, state] = runRecordResult(store, { + consumedTokens: 5, + processingFailures: 0, + }); expect(ok).toBe(1); expect(state).toBe("closed"); }); - it("increments attempt count without incrementing failures", () => { + it("increments cur_attempts without incrementing cur_failures", () => { const store = createRedisStore(); - runRecordResult(store, { success: true }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_attempts")).toBe("1"); - expect(cbHash.get("cb_failures")).toBe("0"); + runRecordResult(store, { consumedTokens: 3, processingFailures: 0 }); + + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("3"); + expect(epHash.get("cur_failures")).toBe("0"); }); }); describe("failure recording", () => { - it("increments both attempts and failures on error", () => { + it("increments both cur_attempts and cur_failures", () => { const store = createRedisStore(); - runRecordResult(store, { success: false }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + runRecordResult(store, { consumedTokens: 5, processingFailures: 1 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_attempts")).toBe("1"); - expect(cbHash.get("cb_failures")).toBe("1"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("5"); + expect(epHash.get("cur_failures")).toBe("1"); }); - it("returns failed state for a single failure below threshold", () => { + it("returns failed state for failures below threshold", () => { const store = createRedisStore(); - const [ok, state] = runRecordResult(store, { success: false }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + const [ok, state] = runRecordResult(store, { + consumedTokens: 1, + processingFailures: 1, + }); expect(ok).toBe(0); expect(state).toBe("failed"); }); + }); - it("stays closed when below error threshold", 
() => { + describe("recording guard — fully open", () => { + it("does not record attempts/failures when circuit is fully open", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 10_000; + + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], + ["cur_attempts", "0"], + ["cur_failures", "0"], + ]), + ); - for (let i = 0; i < 8; i++) { - runRecordResult(store, { now, success: true }); - } - for (let i = 0; i < 2; i++) { - runRecordResult(store, { now, success: false }); - } + runRecordResult(store, { + now, + cooldownPeriodMs: 120_000, + consumedTokens: 5, + processingFailures: 3, + }); - const [ok, state] = runRecordResult(store, { now, success: true }); - expect(ok).toBe(1); - expect(state).toBe("closed"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("0"); + expect(epHash.get("cur_failures")).toBe("0"); }); - }); - describe("circuit opening", () => { - it("opens circuit when error rate exceeds threshold", () => { + it("returns failed when circuit is fully open and state unchanged", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 10_000; - for (let i = 0; i < 4; i++) { - const [, state] = runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - expect(state).toBe("failed"); - } + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], + ]), + ); const [ok, state] = runRecordResult(store, { now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 0, + }); + + expect(ok).toBe(0); + expect(state).toBe("failed"); + }); + }); + + describe("circuit opening", () => { + it("opens circuit when failure rate exceeds threshold", () => { + const store = createRedisStore(); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + const [ok, state] = runRecordResult(store, { + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, }); expect(ok).toBe(0); expect(state).toBe("opened"); @@ -131,243 +177,213 @@ describe("record-result.lua", () => { it("does not open circuit when below minimum attempts", () => { const store = createRedisStore(); - const now = 1_000_000; - - for (let i = 0; i < 4; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 10, - }); - } + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); const [ok, state] = runRecordResult(store, { - now, - success: false, - cbMinAttempts: 10, + consumedTokens: 3, + processingFailures: 3, + minAttempts: 5, + failureThreshold: 0.3, }); expect(ok).toBe(0); expect(state).toBe("failed"); }); - it("sets opened_until_ms with cooldown on open", () => { + it("sets is_open and switched_at on open", () => { const store = createRedisStore(); const now = 1_000_000; - const cooldownMs = 30_000; - - for (let i = 0; i < 5; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - cooldownMs, - }); - } - - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(now + cooldownMs); - }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); - it("resets all counters on open", () => { - const store = createRedisStore(); - const now = 1_000_000; + runRecordResult(store, { + now, + consumedTokens: 5, + 
processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + }); - for (let i = 0; i < 5; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - } - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe("0"); - expect(cbHash.get("cb_prev_failures")).toBe("0"); - expect(cbHash.get("cb_prev_attempts")).toBe("0"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("is_open")).toBe("1"); + expect(Number(epHash.get("switched_at"))).toBe(now); }); - it("does not double-trip when circuit is already open", () => { + it("resets all counters and sets sampleTill on open", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 60_000; + const samplePeriodMs = 300_000; + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); - store.set( - "cb:t1", - new Map([ - ["opened_until_ms", openedUntil.toString()], - ["cb_window_from", now.toString()], - ]), - ); + runRecordResult(store, { + now, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + samplePeriodMs, + }); - for (let i = 0; i < 20; i++) { - const [, state] = runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - expect(state).toBe("failed"); - } - - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_failures")).toBe("0"); + expect(epHash.get("cur_attempts")).toBe("0"); + expect(epHash.get("prev_failures")).toBe("0"); + expect(epHash.get("prev_attempts")).toBe("0"); + expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs); }); }); - describe("two-window blended rate", () => { - it("blends previous window failures into current assessment", () => { + describe("circuit closing — half-open with successes", () => { + it("closes circuit when half-open and batch has successes", () => { const store = createRedisStore(); const now = 1_000_000; - const cbWindowPeriodMs = 60_000; + const switchedAt = now - 130_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", now.toString()], - ["cb_prev_failures", "8"], - ["cb_prev_attempts", "10"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], ]), ); const [ok, state] = runRecordResult(store, { now, - success: false, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 0, }); - expect(ok).toBe(0); - expect(state).toBe("opened"); - }); - it("reduces previous window weight as current window ages", () => { - const store = createRedisStore(); - const cbWindowPeriodMs = 100_000; - const t0 = 1_000_000; - const nearEnd = t0 + cbWindowPeriodMs - 1; - - store.set( - "cb:t1", - new Map([ - ["cb_window_from", t0.toString()], - ["cb_prev_failures", "10"], - ["cb_prev_attempts", "10"], - ]), - ); + expect(ok).toBe(1); + expect(state).toBe("closed"); - for (let i = 0; i < 20; i++) { - runRecordResult(store, { - now: nearEnd, - success: true, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - } - - const [, state] = runRecordResult(store, { - now: nearEnd, - success: false, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - expect(state).toBe("failed"); + const epHash = store.get("ep:t1")!; + 
expect(epHash.get("is_open")).toBe("0"); + expect(Number(epHash.get("switched_at"))).toBe(now); }); - it("ignores previous window when cbWindowPeriodMs is 0", () => { + it("does not close when half-open but all attempts failed", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 130_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", now.toString()], - ["cb_prev_failures", "100"], - ["cb_prev_attempts", "100"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], ]), ); - const [, state] = runRecordResult(store, { + const [ok, state] = runRecordResult(store, { now, - success: false, - cbWindowPeriodMs: 0, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 1, }); + + expect(ok).toBe(0); expect(state).toBe("failed"); }); }); - describe("decay period", () => { - it("preserves opened_until_ms during active decay", () => { + describe("sliding window management", () => { + it("promotes current to previous when sampleTill expires", () => { const store = createRedisStore(); - const openedUntil = 1_060_000; - const duringDecay = openedUntil + 100_000; + const now = 1_000_000; + const samplePeriodMs = 300_000; + const sampleTill = now - 1; store.set( - "cb:t1", - new Map([["opened_until_ms", openedUntil.toString()]]), + "ep:t1", + new Map([ + ["sample_till", sampleTill.toString()], + ["cur_attempts", "10"], + ["cur_failures", "3"], + ["prev_attempts", "0"], + ["prev_failures", "0"], + ]), ); - runRecordResult(store, { - now: duringDecay, - success: true, - decayPeriodMs: 300_000, - }); + runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 }); - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil); + const epHash = store.get("ep:t1")!; + expect(epHash.get("prev_attempts")).toBe("10"); + expect(epHash.get("prev_failures")).toBe("3"); + expect(Number(epHash.get("sample_till"))).toBe( + sampleTill + samplePeriodMs, + ); }); - it("clears opened_until_ms after decay period elapses", () => { + it("complete reset when window is too old", () => { const store = createRedisStore(); - const openedUntil = 1_060_000; - const decayPeriodMs = 300_000; - const afterDecay = openedUntil + decayPeriodMs + 1; + const now = 1_000_000; + const samplePeriodMs = 300_000; + const sampleTill = now - samplePeriodMs - 1; store.set( - "cb:t1", - new Map([["opened_until_ms", openedUntil.toString()]]), + "ep:t1", + new Map([ + ["sample_till", sampleTill.toString()], + ["cur_attempts", "10"], + ["cur_failures", "3"], + ["prev_attempts", "5"], + ["prev_failures", "2"], + ]), ); - runRecordResult(store, { - now: afterDecay, - success: true, - decayPeriodMs, - }); + runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("prev_attempts")).toBe("0"); + expect(epHash.get("prev_failures")).toBe("0"); + expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs); }); - it("clears opened_until_ms when circuit was never opened", () => { + it("interpolates using weight from sampleTill", () => { const store = createRedisStore(); + const samplePeriodMs = 300_000; const now = 1_000_000; + const sampleTill = now + samplePeriodMs; - runRecordResult(store, { now, success: true }); + store.set( + "ep:t1", + new Map([ + ["sample_till", sampleTill.toString()], + 
["prev_attempts", "10"], + ["prev_failures", "10"], + ]), + ); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + // weight = (sampleTill - now) / samplePeriodMs = 1.0 + // interpolated attempts = 10 * 1.0 + 5 = 15 (>= minAttempts 5) + // interpolated failures = 10 * 1.0 + 5 = 15 + // failure rate = 15/15 = 1.0 > 0.3 → opens + const [ok, state] = runRecordResult(store, { + now, + samplePeriodMs, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + }); + expect(ok).toBe(0); + expect(state).toBe("opened"); }); }); describe("state persistence", () => { - it("writes all counter fields to redis", () => { + it("writes all sampling fields to redis", () => { const store = createRedisStore(); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); runRecordResult(store); - const cbHash = store.get("cb:t1")!; - expect(cbHash.has("opened_until_ms")).toBe(true); - expect(cbHash.has("cb_window_from")).toBe(true); - expect(cbHash.has("cb_failures")).toBe(true); - expect(cbHash.has("cb_attempts")).toBe(true); - expect(cbHash.has("cb_prev_failures")).toBe(true); - expect(cbHash.has("cb_prev_attempts")).toBe(true); + const epHash = store.get("ep:t1")!; + expect(epHash.has("cur_attempts")).toBe(true); + expect(epHash.has("cur_failures")).toBe(true); + expect(epHash.has("prev_attempts")).toBe(true); + expect(epHash.has("prev_failures")).toBe(true); + expect(epHash.has("sample_till")).toBe(true); }); }); }); diff --git a/lambdas/https-client-lambda/src/handler.ts b/lambdas/https-client-lambda/src/handler.ts index 28fcc6b9..764e7397 100644 --- a/lambdas/https-client-lambda/src/handler.ts +++ b/lambdas/https-client-lambda/src/handler.ts @@ -12,7 +12,6 @@ import { OUTCOME_SUCCESS, deliverPayload, } from "services/delivery/https-client"; -import type { DeliveryResult } from "services/delivery/https-client"; import { sendToDlq } from "services/dlq-sender"; import { changeVisibility } from "services/sqs-visibility"; import { @@ -26,7 +25,6 @@ import { recordResult, } from "services/endpoint-gate"; import { getRedisClient } from "services/redis-client"; -import { VisibilityManagedError } from "services/visibility-managed-error"; import { recordAdmissionDenied, recordCircuitBreakerClosed, @@ -47,13 +45,20 @@ const DEFAULT_MAX_RETRY_DURATION_MS = 7_200_000; // 2 hours const DEFAULT_CONCURRENCY_LIMIT = 5; const gateConfig: EndpointGateConfig = { - burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? "10"), - cbProbeIntervalMs: Number(process.env.CB_PROBE_INTERVAL_MS ?? "60000"), - decayPeriodMs: Number(process.env.CB_DECAY_PERIOD_MS ?? "300000"), - cbWindowPeriodMs: Number(process.env.CB_WINDOW_PERIOD_MS ?? "60000"), - cbErrorThreshold: Number(process.env.CB_ERROR_THRESHOLD ?? "0.5"), - cbMinAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "10"), - cbCooldownMs: Number(process.env.CB_COOLDOWN_MS ?? "60000"), + // Max tokens the bucket can hold — absorbs short traffic bursts without throttling (default: 2250) + burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? "2250"), + // Probe rate to test endpoint recovery when half-open (default: 1/60 req/s) + probeRateLimit: Number(process.env.CB_PROBE_RATE_LIMIT ?? String(1 / 60)), + // Linear ramp-up after circuit closes, avoids flooding a freshly recovered endpoint (default: 10 min) + recoveryPeriodMs: Number(process.env.CB_RECOVERY_PERIOD_MS ?? 
"600000"), + // Sliding window over which failure rates are sampled (default: 5 min) + samplePeriodMs: Number(process.env.CB_SAMPLE_PERIOD_MS ?? "300000"), + // Failure rate within the sample window that triggers circuit open (default: 30%) + failureThreshold: Number(process.env.CB_FAILURE_THRESHOLD ?? "0.3"), + // Minimum attempts in the sample window before the failure rate is evaluated (default: 5 attempts) + minAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "5"), + // Full block after circuit opens, before half-open probes begin (default: 2 min) + cooldownPeriodMs: Number(process.env.CB_COOLDOWN_PERIOD_MS ?? "120000"), }; type CallbackDeliveryMessage = { @@ -62,223 +67,252 @@ type CallbackDeliveryMessage = { targetId: string; }; -async function checkAdmission( - redis: RedisClientType, - targetId: string, - invocationRateLimit: number, - cbEnabled: boolean, - clientId: string, - record: SQSRecord, - correlationId?: string, -): Promise { - const gateResult = await admit( - redis, - targetId, - invocationRateLimit, - cbEnabled, - gateConfig, - ); +type TargetBatch = { + targetId: string; + records: SQSRecord[]; + messages: CallbackDeliveryMessage[]; +}; - if (!gateResult.allowed) { - const delaySec = Math.ceil(gateResult.retryAfterMs / 1000); - recordAdmissionDenied(clientId, targetId, gateResult.reason, correlationId); - await changeVisibility(record.receiptHandle, delaySec); - throw new VisibilityManagedError(`Admission denied: ${gateResult.reason}`); +function groupByTarget(records: SQSRecord[]): TargetBatch[] { + const groups = new Map< + string, + { records: SQSRecord[]; messages: CallbackDeliveryMessage[] } + >(); + + for (const record of records) { + const message: CallbackDeliveryMessage = JSON.parse(record.body); + const existing = groups.get(message.targetId); + if (existing) { + existing.records.push(record); + existing.messages.push(message); + } else { + groups.set(message.targetId, { records: [record], messages: [message] }); + } } -} -const OUTCOME_DELIVERED = "delivered" as const; -const OUTCOME_DLQ = "dlq" as const; -type RecordOutcome = typeof OUTCOME_DELIVERED | typeof OUTCOME_DLQ; + return [...groups.entries()].map( + ([targetId, { messages, records: recs }]) => ({ + targetId, + records: recs, + messages, + }), + ); +} -async function handleDeliveryResult( - result: DeliveryResult, +async function deliverRecord( record: SQSRecord, - redis: RedisClientType, + message: CallbackDeliveryMessage, + target: Awaited>, + applicationId: string, clientId: string, - targetId: string, - cbEnabled: boolean, - correlationId?: string, -): Promise { +): Promise<{ success: boolean }> { + const correlationId = message.payload.data[0]?.attributes?.messageId; + + const maxRetryDurationMs = + target.delivery?.maxRetryDurationSeconds === undefined + ? 
DEFAULT_MAX_RETRY_DURATION_MS + : target.delivery.maxRetryDurationSeconds * 1000; + + const firstReceivedMs = Number( + record.attributes.ApproximateFirstReceiveTimestamp, + ); + + if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) { + recordRetryWindowExhausted(clientId, message.targetId, correlationId); + await sendToDlq(record.body); + return { success: true }; + } + + const agent = await buildAgent(target); + const signature = signPayload( + applicationId, + target.apiKey.headerValue, + message.payload, + ); + const payloadJson = JSON.stringify(message.payload); + + recordDeliveryAttempt(clientId, message.targetId, correlationId); + const deliveryStart = Date.now(); + const result = await deliverPayload(target, payloadJson, signature, agent); + recordDeliveryDuration(message.targetId, Date.now() - deliveryStart); + if (result.outcome === OUTCOME_SUCCESS) { - if (cbEnabled) { - const cbOutcome = await recordResult(redis, targetId, true, gateConfig); - if (cbOutcome.ok && cbOutcome.state === "closed") { - recordCircuitBreakerClosed(targetId, correlationId); - } - } - recordDeliverySuccess(clientId, targetId, correlationId); - return OUTCOME_DELIVERED; + recordDeliverySuccess(clientId, message.targetId, correlationId); + return { success: true }; } if (result.outcome === OUTCOME_PERMANENT_FAILURE) { recordDeliveryPermanentFailure( clientId, - targetId, + message.targetId, result.statusCode, result.errorCode, correlationId, ); await sendToDlq(record.body, result); - return OUTCOME_DLQ; + return { success: true }; } if (result.outcome === OUTCOME_RATE_LIMITED) { const receiveCount = Number(record.attributes.ApproximateReceiveCount); - recordDeliveryRateLimited(clientId, targetId, correlationId); + recordDeliveryRateLimited(clientId, message.targetId, correlationId); await handleRateLimitedRecord( record, clientId, - targetId, + message.targetId, result.retryAfterHeader, receiveCount, ); - return OUTCOME_DELIVERED; // unreachable — handleRateLimitedRecord always throws + return { success: true }; } const receiveCount = Number(record.attributes.ApproximateReceiveCount); const backoffSec = jitteredBackoffSeconds(receiveCount); - if (cbEnabled) { - const cbOutcome = await recordResult(redis, targetId, false, gateConfig); - if (cbOutcome.state === "opened") { - recordCircuitBreakerOpen(targetId, correlationId); - } - } recordDeliveryFailure( clientId, - targetId, + message.targetId, result.statusCode, backoffSec, receiveCount, correlationId, ); await changeVisibility(record.receiptHandle, backoffSec); - throw new VisibilityManagedError(`Transient failure: ${result.statusCode}`); + return { success: false }; } -async function processRecord( - record: SQSRecord, +async function processTargetBatch( + batch: TargetBatch, redis: RedisClientType, -): Promise { - const { CLIENT_ID } = process.env; - if (!CLIENT_ID) { - throw new Error("CLIENT_ID is required"); + clientId: string, + concurrencyLimit: number, +): Promise { + const target = await loadTargetConfig(clientId, batch.targetId); + const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? 
false; + + const gateResult = await admit( + redis, + batch.targetId, + target.invocationRateLimit, + cbEnabled, + batch.records.length, + gateConfig, + ); + + if (!gateResult.allowed) { + const delaySec = Math.ceil(gateResult.retryAfterMs / 1000); + recordAdmissionDenied(clientId, batch.targetId, gateResult.reason); + const failures: SQSBatchItemFailure[] = []; + for (const record of batch.records) { + await changeVisibility(record.receiptHandle, delaySec); + failures.push({ itemIdentifier: record.messageId }); + } + return failures; } - const message: CallbackDeliveryMessage = JSON.parse(record.body); - const { payload, targetId } = message; - const messageId = payload.data[0]?.attributes?.messageId; + const { consumedTokens } = gateResult; + const admitted = batch.records.slice(0, consumedTokens); + const rejected = batch.records.slice(consumedTokens); + const admittedMessages = batch.messages.slice(0, consumedTokens); - logger.info("Processing delivery", { - clientId: CLIENT_ID, - targetId, - messageId, - sqsMessageId: record.messageId, - receiveCount: record.attributes.ApproximateReceiveCount, - }); + const applicationId = await getApplicationId(clientId); - const target = await loadTargetConfig(CLIENT_ID, targetId); - const maxRetryDurationMs = - target.delivery?.maxRetryDurationSeconds === undefined - ? DEFAULT_MAX_RETRY_DURATION_MS - : target.delivery.maxRetryDurationSeconds * 1000; + const failures: SQSBatchItemFailure[] = []; + let processingFailures = 0; - const firstReceivedMs = Number( - record.attributes.ApproximateFirstReceiveTimestamp, + const deliveryResults = await pMap( + admitted, + async (record, index): Promise<{ record: SQSRecord; success: boolean }> => { + try { + const outcome = await deliverRecord( + record, + admittedMessages[index], + target, + applicationId, + clientId, + ); + return { record, success: outcome.success }; + } catch (error) { + logger.error("Failed to process record", { + messageId: record.messageId, + err: error, + }); + const receiveCount = Number(record.attributes.ApproximateReceiveCount); + await changeVisibility( + record.receiptHandle, + jitteredBackoffSeconds(receiveCount), + ); + return { record, success: false }; + } + }, + { concurrency: concurrencyLimit }, ); - if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) { - recordRetryWindowExhausted(CLIENT_ID, targetId, messageId); - await sendToDlq(record.body); - return OUTCOME_DLQ; + for (const { record, success } of deliveryResults) { + if (!success) { + processingFailures += 1; + failures.push({ itemIdentifier: record.messageId }); + } } - const applicationId = await getApplicationId(CLIENT_ID); - const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? 
false; - - await checkAdmission( - redis, - targetId, - target.invocationRateLimit, - cbEnabled, - CLIENT_ID, - record, - messageId, - ); - - const agent = await buildAgent(target); - const signature = signPayload( - applicationId, - target.apiKey.headerValue, - payload, - ); - const payloadJson = JSON.stringify(payload); + if (cbEnabled && consumedTokens > 0) { + const cbOutcome = await recordResult( + redis, + batch.targetId, + consumedTokens, + processingFailures, + gateConfig, + ); + if (!cbOutcome.ok && cbOutcome.state === "opened") { + recordCircuitBreakerOpen(batch.targetId); + } + if (cbOutcome.ok && cbOutcome.state === "closed") { + recordCircuitBreakerClosed(batch.targetId); + } + } - recordDeliveryAttempt(CLIENT_ID, targetId, messageId); - const deliveryStart = Date.now(); - const result = await deliverPayload(target, payloadJson, signature, agent); - recordDeliveryDuration(targetId, Date.now() - deliveryStart); + for (const record of rejected) { + failures.push({ itemIdentifier: record.messageId }); + } - return handleDeliveryResult( - result, - record, - redis, - CLIENT_ID, - targetId, - cbEnabled, - messageId, - ); + return failures; } export async function processRecords( records: SQSRecord[], ): Promise { - resetMetrics(); + const { CLIENT_ID } = process.env; + if (!CLIENT_ID) { + throw new Error("CLIENT_ID is required"); + } - logger.info("Batch received", { batchSize: records.length }); + resetMetrics(); const concurrencyLimit = Number( process.env.CONCURRENCY_LIMIT ?? String(DEFAULT_CONCURRENCY_LIMIT), ); + logger.info("Batch received", { batchSize: records.length }); + const redis = await getRedisClient(); + const targetBatches = groupByTarget(records); - const results = await pMap( - records, - async (record): Promise => { - try { - return await processRecord(record, redis); - } catch (error) { - if (!(error instanceof VisibilityManagedError)) { - logger.error("Failed to process record", { - messageId: record.messageId, - err: error, - }); - const receiveCount = Number( - record.attributes.ApproximateReceiveCount, - ); - await changeVisibility( - record.receiptHandle, - jitteredBackoffSeconds(receiveCount), - ); - } - return { itemIdentifier: record.messageId }; - } - }, - { concurrency: concurrencyLimit }, - ); + const allFailures: SQSBatchItemFailure[] = []; + + for (const batch of targetBatches) { + const batchFailures = await processTargetBatch( + batch, + redis, + CLIENT_ID, + concurrencyLimit, + ); + allFailures.push(...batchFailures); + } - await flushMetrics(); - const failures = results.filter( - (r): r is SQSBatchItemFailure => typeof r === "object", - ); - const deliveredCount = results.filter((r) => r === OUTCOME_DELIVERED).length; - const dlqCount = results.filter((r) => r === OUTCOME_DLQ).length; logger.info("Batch complete", { batchSize: records.length, - deliveredCount, - dlqCount, - failureCount: failures.length, + failureCount: allFailures.length, }); - return failures; + + await flushMetrics(); + return allFailures; } diff --git a/lambdas/https-client-lambda/src/services/admit.lua b/lambdas/https-client-lambda/src/services/admit.lua index fd56decb..36809e40 100644 --- a/lambdas/https-client-lambda/src/services/admit.lua +++ b/lambdas/https-client-lambda/src/services/admit.lua @@ -1,203 +1,98 @@ --- admit.lua — Decides whether a request to an endpoint is allowed. +-- admit.lua — Pre-processing: determines rate limit and consumes tokens. -- --- Three sequential checks run atomically: --- 1. Circuit breaker — is the endpoint currently healthy? --- 2. 
Sliding window — roll the two-window error-rate accounting state if needed --- 3. Token bucket — is the endpoint within its rate limit? +-- Two sequential steps run atomically: +-- 1. Circuit breaker — determine effective rate from circuit state +-- 2. Token bucket — consume tokens for the target batch -- --- A request is allowed only when all three checks pass. +-- The circuit has four states: +-- Open (during cooldown): rate = 0, complete block, bucket untouched +-- Half-open (after cooldown): rate = probeRateLimit +-- Recovering (closed, during recovery period): linear ramp-up +-- Normal (closed): full configured rate -- --- While the circuit is open, a timed probe is let through at most once per --- cbProbeIntervalMs so the caller can test whether the endpoint has recovered. --- The probe bypasses the rate limit — counting it here would skew a --- low-volume probe signal against the recovery decision. --- --- After the circuit closes, the token fill rate ramps up linearly from --- near-zero to full over decayPeriodMs to avoid a thundering herd on recovery. --- --- Returns: { allowed (0|1), reason, retryAfterMs, effectiveRate } +-- Returns: { consumedTokens, reason, retryAfterMs, effectiveRate } -- Keys -local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash -local rlKey = KEYS[2] -- rl:{endpoint} rate limiter state hash +local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash -- Arguments -local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms) -local capacity = tonumber(ARGV[2]) or 0 -- token bucket maximum capacity -local refillPerSec = tonumber(ARGV[3]) or 0 -- full token fill rate (tokens/sec) -local cooldownMs = tonumber(ARGV[4]) or 0 -- how long the circuit stays open (ms) -local decayPeriodMs = tonumber(ARGV[5]) or 0 -- ramp-up window after circuit closes (ms) -local cbWindowPeriodMs = tonumber(ARGV[6]) or 0 -- error-rate sliding window duration (ms) -local cbProbeIntervalMs = tonumber(ARGV[7]) or 0 -- minimum gap between probe requests (ms; 0 = no probes) - --- TTL policy: circuit breaker state must outlive the cooldown window so that --- the ramp-up period remains visible to subsequent calls after a close. --- Rate limiter state needs only a short idle window. 
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60 -local rlTtlSeconds = 120 +local now = tonumber(ARGV[1]) or 0 +local capacity = tonumber(ARGV[2]) or 0 +local targetRateLimit = tonumber(ARGV[3]) or 0 +local cooldownMs = tonumber(ARGV[4]) or 0 +local recoveryPeriodMs = tonumber(ARGV[5]) or 0 +local probeRateLimit = tonumber(ARGV[6]) or 0 +local targetBatchSize = tonumber(ARGV[7]) or 0 -------------------------------------------------------------------------------- -- LOAD STATE -------------------------------------------------------------------------------- -local cb = redis.call("HMGET", cbKey, - "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts", "last_probe_ms", - "cb_prev_failures", "cb_prev_attempts") -local openedUntil = tonumber(cb[1] or "0") -local cbWindowFrom = tonumber(cb[2] or "0") -local cbFailures = tonumber(cb[3] or "0") -local cbAttempts = tonumber(cb[4] or "0") -local lastProbeMs = tonumber(cb[5] or "0") -local cbPrevFailures = tonumber(cb[6] or "0") -local cbPrevAttempts = tonumber(cb[7] or "0") - -local rl = redis.call("HMGET", rlKey, "tokens", "last_refill_ms") -local tokens = tonumber(rl[1] or capacity) -local lastRefill = tonumber(rl[2] or now) +local state = redis.call("HMGET", epKey, + "is_open", "switched_at", "bucket_tokens", "bucket_refilled_at") +local isOpen = tonumber(state[1] or "0") == 1 +local switchedAtRaw = state[2] +local switchedAt = tonumber(switchedAtRaw or tostring(now)) +local bucketTokens = tonumber(state[3] or "0") +local bucketRefilledAt = tonumber(state[4] or "0") +local needInitSwitchedAt = switchedAtRaw == false or switchedAtRaw == nil -------------------------------------------------------------------------------- --- 1. CIRCUIT BREAKER --- --- The circuit is open when openedUntil is set and has not yet elapsed. --- All requests are rejected while open to give the endpoint time to recover. --- --- Timed probes: once per cbProbeIntervalMs a single request is allowed --- through even while the circuit is open. The caller must record the --- outcome via record-result.lua; a successful probe will close the circuit --- and trigger the ramp-up phase. +-- 1. CIRCUIT BREAKER — determine effective rate -------------------------------------------------------------------------------- -if openedUntil > 0 and now < openedUntil then - -- Allow a probe through if the probe interval has elapsed - if cbProbeIntervalMs > 0 and (now - lastProbeMs) >= cbProbeIntervalMs then - lastProbeMs = now - redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "last_probe_ms", lastProbeMs, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - redis.call("EXPIRE", cbKey, cbTtlSeconds) - return { 1, "probe", 0, 0 } - end +local isHalfOpen = isOpen and now > switchedAt + cooldownMs +local isRecovering = (not isOpen) and now < switchedAt + recoveryPeriodMs - -- Circuit is open and no probe slot is available — reject - return { 0, "circuit_open", openedUntil - now, 0 } -end - --------------------------------------------------------------------------------- --- 2. SLIDING WINDOW --- --- Two windows (current + previous) together approximate a sliding window over --- cbWindowPeriodMs. When the current window expires it is promoted to previous --- and a fresh current window starts. 
record-result.lua blends the two windows --- using a time-based weight to smooth the error rate across the boundary rather --- than resetting it to zero at expiry. --- --- record-result.lua is responsible for incrementing the counters; this script --- is only responsible for rolling the window boundary forward when it expires. --------------------------------------------------------------------------------- +local effectiveRate -if cbWindowFrom == 0 then - -- No window exists yet — start one now - cbWindowFrom = now -elseif (now - cbWindowFrom) > cbWindowPeriodMs then - -- Current window has expired — roll it forward - if (now - cbWindowFrom) > (2 * cbWindowPeriodMs) then - -- Both current and previous windows are stale: a long quiet period means - -- old failure counts are no longer relevant to the health of the endpoint. - cbPrevFailures = 0 - cbPrevAttempts = 0 +if isOpen then + if isHalfOpen then + effectiveRate = probeRateLimit else - -- Promote current → previous so it can be blended with the new current window - cbPrevFailures = cbFailures - cbPrevAttempts = cbAttempts + return { 0, "circuit_open", (switchedAt + cooldownMs) - now, 0 } + end +else + if isRecovering then + effectiveRate = targetRateLimit * (now - switchedAt) / recoveryPeriodMs + else + effectiveRate = targetRateLimit end - cbFailures = 0 - cbAttempts = 0 - cbWindowFrom = now end -------------------------------------------------------------------------------- --- 3. TOKEN BUCKET +-- 2. TOKEN BUCKET — batch consumption -- --- Refills tokens based on elapsed time, then tries to consume one. --- If no tokens are available the request is rate-limited. +-- Generate tokens based on elapsed time, then consume as many as needed for +-- the batch, up to the number available. -- --- Ramp-up: after the circuit closes (openedUntil is set but in the past), --- effectiveRate scales linearly from near-zero to the full refillPerSec over --- decayPeriodMs. This deliberately slows recovery traffic so a flapping --- endpoint is not immediately overwhelmed. --- Once decayPeriodMs elapses, openedUntil is cleared and the full rate resumes. +-- Refill precision: bucketRefilledAt advances by exactly the time required to +-- generate the whole tokens (not set to `now`), preserving fractional time. -------------------------------------------------------------------------------- -local effectiveRate = refillPerSec - -if openedUntil > 0 and now > openedUntil and decayPeriodMs > 0 then - -- Circuit has recently closed — apply linear ramp-up - local sinceClose = now - openedUntil - if sinceClose >= decayPeriodMs then - -- Decay period fully elapsed — restore full rate and clear the CB timestamp - openedUntil = 0 - else - -- Still within decay period — scale fill rate proportionally to time elapsed - local fraction = sinceClose / decayPeriodMs - effectiveRate = math.max(1, math.floor(refillPerSec * fraction)) - end -end - --- Refill tokens based on time elapsed since last refill -local elapsed = now - lastRefill -if elapsed > 0 then - local refill = math.floor((elapsed * effectiveRate) / 1000) - if refill > 0 then - tokens = math.min(capacity, tokens + refill) - lastRefill = now - end -end +local generatedTokens = math.floor((now - bucketRefilledAt) * effectiveRate / 1000) +local availTokens = math.min(capacity, bucketTokens + generatedTokens) +local consumedTokens = math.min(targetBatchSize, availTokens) --- Not enough tokens — rate-limited --- TTL is intentionally not refreshed here; it was set on the last allowed call. 
-if tokens < 1 then - redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - redis.call("HSET", rlKey, - "tokens", tokens, - "last_refill_ms", lastRefill - ) - return { 0, "rate_limited", 1000, effectiveRate } +bucketTokens = availTokens - consumedTokens +if generatedTokens > 0 and effectiveRate > 0 then + local generationTime = generatedTokens * 1000 / effectiveRate + bucketRefilledAt = bucketRefilledAt + generationTime end --- Consume one token -tokens = tokens - 1 - -------------------------------------------------------------------------------- --- 4. PERSIST STATE AND ALLOW +-- 3. PERSIST STATE AND RETURN -------------------------------------------------------------------------------- -redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts -) -redis.call("HSET", rlKey, - "tokens", tokens, - "last_refill_ms", lastRefill +redis.call("HSET", epKey, + "bucket_tokens", bucketTokens, + "bucket_refilled_at", bucketRefilledAt ) -redis.call("EXPIRE", cbKey, cbTtlSeconds) -redis.call("EXPIRE", rlKey, rlTtlSeconds) +if needInitSwitchedAt then + redis.call("HSET", epKey, "switched_at", switchedAt) +end -return { 1, "allowed", 0, effectiveRate } +local reason = consumedTokens < 1 and "rate_limited" or "allowed" +local retryAfter = consumedTokens < 1 and 1000 or 0 +return { consumedTokens, reason, retryAfter, effectiveRate } diff --git a/lambdas/https-client-lambda/src/services/endpoint-gate.ts b/lambdas/https-client-lambda/src/services/endpoint-gate.ts index c2d85439..bf9c1462 100644 --- a/lambdas/https-client-lambda/src/services/endpoint-gate.ts +++ b/lambdas/https-client-lambda/src/services/endpoint-gate.ts @@ -5,7 +5,7 @@ import recordResultLuaSrc from "services/record-result.lua"; export type AdmitResultAllowed = { allowed: true; - probe: boolean; + consumedTokens: number; effectiveRate: number; }; @@ -24,12 +24,12 @@ export type RecordResultOutcome = export type EndpointGateConfig = { burstCapacity: number; - cbProbeIntervalMs: number; - decayPeriodMs: number; - cbWindowPeriodMs: number; - cbErrorThreshold: number; - cbMinAttempts: number; - cbCooldownMs: number; + probeRateLimit: number; + recoveryPeriodMs: number; + samplePeriodMs: number; + failureThreshold: number; + minAttempts: number; + cooldownPeriodMs: number; }; let admitSha: string | undefined; @@ -76,22 +76,21 @@ export async function admit( targetId: string, refillPerSec: number, cbEnabled: boolean, + targetBatchSize: number, config: EndpointGateConfig, ): Promise { - const cbKey = `cb:{${targetId}}`; - const rlKey = `rl:{${targetId}}`; + const epKey = `ep:{${targetId}}`; const now = Date.now().toString(); - const probeIntervalMs = cbEnabled ? config.cbProbeIntervalMs.toString() : "0"; + const probeRate = cbEnabled ? 
config.probeRateLimit.toString() : "0"; const args = [ now, config.burstCapacity.toString(), - // eslint-disable-next-line sonarjs/null-dereference - refillPerSec.toString(), - config.cbCooldownMs.toString(), - config.decayPeriodMs.toString(), - config.cbWindowPeriodMs.toString(), - probeIntervalMs, + String(refillPerSec), + config.cooldownPeriodMs.toString(), + config.recoveryPeriodMs.toString(), + probeRate, + String(targetBatchSize), ]; if (!admitSha) { @@ -102,16 +101,16 @@ export async function admit( client, admitLuaSrc, admitSha, - [cbKey, rlKey], + [epKey], args, )) as [number, string, number, number]; - const [allowed, reason, retryAfterMs, effectiveRate] = raw; + const [consumedOrFlag, reason, retryAfterMs, effectiveRate] = raw; - if (allowed === 1) { + if (reason === "allowed" || reason === "probe") { return { allowed: true, - probe: reason === "probe", + consumedTokens: Number(consumedOrFlag), effectiveRate: Number(effectiveRate), }; } @@ -127,20 +126,22 @@ export async function admit( export async function recordResult( client: RedisClientType, targetId: string, - success: boolean, + consumedTokens: number, + processingFailures: number, config: EndpointGateConfig, ): Promise { - const cbKey = `cb:{${targetId}}`; + const epKey = `ep:{${targetId}}`; const now = Date.now().toString(); const args = [ now, - success ? "1" : "0", - config.cbCooldownMs.toString(), - config.decayPeriodMs.toString(), - config.cbErrorThreshold.toString(), - config.cbMinAttempts.toString(), - config.cbWindowPeriodMs.toString(), + String(consumedTokens), + String(processingFailures), + config.cooldownPeriodMs.toString(), + config.recoveryPeriodMs.toString(), + config.failureThreshold.toString(), + config.minAttempts.toString(), + config.samplePeriodMs.toString(), ]; if (!recordResultSha) { @@ -151,7 +152,7 @@ export async function recordResult( client, recordResultLuaSrc, recordResultSha, - [cbKey], + [epKey], args, )) as [number, string]; diff --git a/lambdas/https-client-lambda/src/services/record-result.lua b/lambdas/https-client-lambda/src/services/record-result.lua index 1cc94857..fa3b1b12 100644 --- a/lambdas/https-client-lambda/src/services/record-result.lua +++ b/lambdas/https-client-lambda/src/services/record-result.lua @@ -1,144 +1,150 @@ --- record-result.lua — Records the outcome of a delivery attempt. +-- record-result.lua — Post-processing: updates sampling and circuit breaker. -- --- Updates the circuit breaker's error-rate window counters and opens the --- circuit if the failure rate exceeds the configured threshold. --- --- On success: --- Window counters are left intact. The openedUntil timestamp is preserved --- while the decay period is still active so that admit.lua can continue --- computing the linear ramp-up rate. Once the decay period elapses it --- is zeroed, returning the circuit to a fully clean closed state. --- --- On failure: --- The failure and attempt counters are incremented. A two-window sliding --- blend is computed before evaluating the trip condition: --- slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight --- slidingFailures = cbFailures + cbPrevFailures * prevWeight --- where prevWeight decays linearly from 1.0 → 0.0 as the current window ages, --- so previous-window failures fade out gradually rather than dropping off a cliff. 
--- The circuit opens when: --- • the endpoint is not already open (prevents double-tripping and --- resetting the cooldown timer prematurely), AND --- • slidingAttempts >= cbMinAttempts (avoids tripping on statistically --- insignificant data at cold start or just after a window roll), AND --- • slidingFailures / slidingAttempts exceeds cbErrorThreshold. --- On open, all counters (current and previous) are reset to zero so the --- fresh cooldown window begins with a clean slate ready for recovery. +-- After processing a batch, this script: +-- 1. Manages the sliding window (rolling forward as necessary) +-- 2. Records new attempts and failures (unless fully open) +-- 3. Interpolates attempt/failure rates using the sliding window +-- 4. Checks whether to close the circuit (half-open + successes) +-- 5. Checks whether to open the circuit (closed + threshold exceeded) -- -- Returns: { ok (0|1), state } -- state: "closed" | "opened" | "failed" +-- Return state constants +local OPENED = "opened" +local CLOSED = "closed" +local FAILED = "failed" + -- Keys -local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash +local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash -- Arguments -local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms) -local success = tonumber(ARGV[2]) or 0 -- 1 = success, 0 = failure -local cooldownMs = tonumber(ARGV[3]) or 0 -- how long the circuit stays open (ms) -local decayPeriodMs = tonumber(ARGV[4]) or 0 -- ramp-up window after circuit closes (ms) -local cbErrorThreshold = tonumber(ARGV[5]) or 0 -- error-rate fraction that trips the circuit (e.g. 0.5) -local cbMinAttempts = tonumber(ARGV[6]) or 0 -- minimum samples before the circuit can trip -local cbWindowPeriodMs = tonumber(ARGV[7]) or 0 -- error-rate sliding window duration (ms) - --- TTL policy: keep circuit breaker state alive for at least the cooldown --- duration plus a buffer so the decay period remains visible after a close. 
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60 - -local function refreshCbExpiry() - redis.call("EXPIRE", cbKey, cbTtlSeconds) -end +local now = tonumber(ARGV[1]) or 0 +local consumedTokens = tonumber(ARGV[2]) or 0 +local processingFailures = tonumber(ARGV[3]) or 0 +local cooldownPeriodMs = tonumber(ARGV[4]) or 0 +local _recoveryPeriodMs = tonumber(ARGV[5]) or 0 -- luacheck: ignore +local failureThreshold = tonumber(ARGV[6]) or 0 +local minAttempts = tonumber(ARGV[7]) or 0 +local samplePeriodMs = tonumber(ARGV[8]) or 0 -------------------------------------------------------------------------------- -- LOAD CURRENT STATE -------------------------------------------------------------------------------- -local cb = redis.call("HMGET", cbKey, - "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts", - "cb_prev_failures", "cb_prev_attempts") -local openedUntil = tonumber(cb[1] or "0") -local cbWindowFrom = tonumber(cb[2] or "0") -local cbFailures = tonumber(cb[3] or "0") -local cbAttempts = tonumber(cb[4] or "0") -local cbPrevFailures = tonumber(cb[5] or "0") -local cbPrevAttempts = tonumber(cb[6] or "0") +local state = redis.call("HMGET", epKey, + "is_open", "switched_at", + "cur_attempts", "prev_attempts", "cur_failures", "prev_failures", + "sample_till") +local isOpen = tonumber(state[1] or "0") == 1 +local switchedAt = tonumber(state[2] or tostring(now)) +local curAttempts = tonumber(state[3] or "0") +local prevAttempts = tonumber(state[4] or "0") +local curFailures = tonumber(state[5] or "0") +local prevFailures = tonumber(state[6] or "0") +local sampleTill = tonumber(state[7] or "0") --- Every outcome (success or failure) contributes to the error-rate window -cbAttempts = cbAttempts + 1 +-------------------------------------------------------------------------------- +-- 1. DETERMINE CIRCUIT SUB-STATE +-------------------------------------------------------------------------------- + +local isHalfOpen = isOpen and now > switchedAt + cooldownPeriodMs +local isFullyOpen = isOpen and not isHalfOpen -------------------------------------------------------------------------------- --- SUCCESS — preserve openedUntil during decay, then zero it --- --- admit.lua uses openedUntil to calculate the linear ramp-up rate while the --- decay period is active. That timestamp must survive in Redis until the --- decay period ends. Clearing it prematurely would snap the fill rate back --- to full immediately rather than ramping gradually. +-- 2. 
MANAGE SLIDING WINDOW -------------------------------------------------------------------------------- -if success == 1 then - -- Keep openedUntil only if we are still within the decay window - local inDecayWindow = openedUntil > 0 and now > openedUntil and (now - openedUntil) < decayPeriodMs - local preservedOpenedUntil = inDecayWindow and openedUntil or 0 - - redis.call("HSET", cbKey, - "opened_until_ms", preservedOpenedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - refreshCbExpiry() - return { 1, "closed" } +if sampleTill < now then + if sampleTill + samplePeriodMs < now then + -- Complete reset — window is too old + prevAttempts = 0 + prevFailures = 0 + sampleTill = now + samplePeriodMs + else + -- Promote current to previous + prevAttempts = curAttempts + prevFailures = curFailures + sampleTill = sampleTill + samplePeriodMs + end + curAttempts = 0 + curFailures = 0 end -------------------------------------------------------------------------------- --- FAILURE — increment counter and evaluate whether to open the circuit --- --- The trip condition is evaluated against a sliding blend of current and --- previous window counts, not the raw current-window counts alone. This --- prevents a burst of failures from escaping detection simply because it --- straddles a window boundary and gets partially discarded by a reset. +-- 3. RECORD NEW ATTEMPTS/FAILURES (unless fully open) -------------------------------------------------------------------------------- -cbFailures = cbFailures + 1 - --- The circuit is already open when openedUntil is set and has not yet elapsed. --- Guard against double-tripping, which would reset the cooldown timer early. -local circuitAlreadyOpen = openedUntil > 0 and now < openedUntil - --- Blend current and previous window counts. --- prevWeight decays linearly from 1.0 → 0.0 as the current window ages, --- so previous-window failures fade out gradually rather than dropping off a cliff. -local windowElapsed = cbWindowFrom > 0 and (now - cbWindowFrom) or 0 -local hasWindow = cbWindowPeriodMs > 0 -local prevWeight = hasWindow and math.max(0, (cbWindowPeriodMs - windowElapsed) / cbWindowPeriodMs) or 0 -local slidingFailures = cbFailures + cbPrevFailures * prevWeight -local slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight - -if not circuitAlreadyOpen - and slidingAttempts >= cbMinAttempts -- enough data to be statistically meaningful - and (slidingFailures / slidingAttempts) > cbErrorThreshold then - -- Trip the circuit — reset all counters so recovery starts from a clean slate - redis.call("HSET", cbKey, - "opened_until_ms", now + cooldownMs, - "cb_window_from", 0, - "cb_failures", 0, - "cb_attempts", 0, - "cb_prev_failures", 0, - "cb_prev_attempts", 0 - ) - refreshCbExpiry() - return { 0, "opened" } +if not isFullyOpen then + curAttempts = curAttempts + consumedTokens + curFailures = curFailures + processingFailures +end + +-------------------------------------------------------------------------------- +-- 4. INTERPOLATE VALUES +-------------------------------------------------------------------------------- + +local weight = (sampleTill - now) / samplePeriodMs +local attempts = prevAttempts * weight + curAttempts +local failures = prevFailures * weight + curFailures + +-------------------------------------------------------------------------------- +-- 5. 
CIRCUIT BREAKER LOGIC +-------------------------------------------------------------------------------- + +local processingSuccesses = consumedTokens - processingFailures +local stateChanged = false + +-- Close circuit when half-open and there are successes +if isHalfOpen and processingSuccesses > 0 then + isOpen = false + switchedAt = now + stateChanged = true + -- fall through, allow circuit to immediately re-open +end + +-- Open circuit when closed, enough samples, and threshold exceeded +local hasSampledEnough = attempts >= minAttempts +if not isOpen and hasSampledEnough and (failures / attempts) > failureThreshold then + isOpen = true + switchedAt = now + curAttempts = 0 + curFailures = 0 + prevAttempts = 0 + prevFailures = 0 + sampleTill = now + samplePeriodMs + stateChanged = true end --- Below the threshold — record the failure but keep the circuit closed -redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts +-------------------------------------------------------------------------------- +-- 6. PERSIST STATE +-------------------------------------------------------------------------------- + +redis.call("HSET", epKey, + "cur_attempts", curAttempts, + "prev_attempts", prevAttempts, + "cur_failures", curFailures, + "prev_failures", prevFailures, + "sample_till", sampleTill ) -refreshCbExpiry() -return { 0, "failed" } + +if stateChanged then + redis.call("HSET", epKey, + "is_open", isOpen and 1 or 0, + "switched_at", switchedAt + ) +end + +if stateChanged and isOpen then + return { 0, OPENED } +end + +if stateChanged and not isOpen then + return { 1, CLOSED } +end + +if isOpen or processingFailures > 0 then + return { 0, FAILED } +end + +return { 1, CLOSED } diff --git a/lambdas/perf-runner-lambda/package.json b/lambdas/perf-runner-lambda/package.json index 9f9d01d8..59d7691b 100644 --- a/lambdas/perf-runner-lambda/package.json +++ b/lambdas/perf-runner-lambda/package.json @@ -13,13 +13,17 @@ "typecheck": "tsc --noEmit" }, "dependencies": { + "@aws-crypto/sha256-js": "catalog:aws", "@aws-sdk/client-cloudwatch-logs": "catalog:aws", "@aws-sdk/client-sqs": "catalog:aws", + "@aws-sdk/credential-providers": "catalog:aws", + "@smithy/signature-v4": "catalog:aws", "@nhs-notify-client-callbacks/logger": "workspace:*", "@nhs-notify-client-callbacks/models": "workspace:*", - "esbuild": "catalog:tools" + "@redis/client": "catalog:app" }, "devDependencies": { + "esbuild": "catalog:tools", "@tsconfig/node22": "catalog:tools", "@types/aws-lambda": "catalog:tools", "@types/jest": "catalog:test", diff --git a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts index 055ac7bc..526de638 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts @@ -1,5 +1,10 @@ import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; const mockCloudWatchClient = { send: jest.fn(), @@ -285,3 +290,373 @@ describe("queryDeliveryMetricsSnapshot", () => { expect(result).toBeNull(); }); }); + +describe("queryCircuitBreakerSnapshot", () => { + it("returns null 
when logGroupNames is empty", async () => { + const result = await queryCircuitBreakerSnapshot( + mockCloudWatchClient, + [], + 0, + 60, + ); + + expect(result).toBeNull(); + expect(mockCloudWatchClient.send).not.toHaveBeenCalled(); + }); + + it("returns null when StartQuery returns no queryId", async () => { + mockCloudWatchClient.send.mockResolvedValueOnce({} as never); + + const result = await queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 0, + 60, + ); + + expect(result).toBeNull(); + }); + + it("returns a snapshot with zeroed metrics when the result row is empty", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb1" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 100, + 160, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toMatchObject({ + intervalStartSec: 100, + intervalEndSec: 160, + circuitOpenEvents: 0, + circuitCloseEvents: 0, + admissionDeniedCircuitOpen: 0, + admissionDeniedRateLimited: 0, + deliveryAttempts: 0, + deliverySuccesses: 0, + deliveryFailures: 0, + deliveryRateLimited: 0, + }); + expect(result?.snapshotAt).toBeGreaterThan(0); + }); + + it("returns a populated snapshot when query completes successfully", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb2" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "circuitOpenEvents", value: "3" }, + { field: "circuitCloseEvents", value: "2" }, + { field: "admissionDeniedCircuitOpen", value: "15" }, + { field: "admissionDeniedRateLimited", value: "8" }, + { field: "deliveryAttempts", value: "200" }, + { field: "deliverySuccesses", value: "180" }, + { field: "deliveryFailures", value: "12" }, + { field: "deliveryRateLimited", value: "8" }, + ], + ], + } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 100, + 160, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toMatchObject({ + intervalStartSec: 100, + intervalEndSec: 160, + circuitOpenEvents: 3, + circuitCloseEvents: 2, + admissionDeniedCircuitOpen: 15, + admissionDeniedRateLimited: 8, + deliveryAttempts: 200, + deliverySuccesses: 180, + deliveryFailures: 12, + deliveryRateLimited: 8, + }); + }); + + it("sends logGroupNames to StartQuery", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb3" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const logGroups = [ + "/aws/lambda/test-https-client-perf-client-1", + "/aws/lambda/test-https-client-perf-client-2", + ]; + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + logGroups, + 0, + 60, + ); + + await jest.runAllTimersAsync(); + await promise; + + const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as { + input: { logGroupNames: string[] }; + }; + expect(startCmd.input.logGroupNames).toEqual(logGroups); + }); + + it("returns null when the query status is Failed", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb4" } as never) + .mockResolvedValueOnce({ status: "Failed" } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + 
["/aws/lambda/test-https-client-perf-client-1"], + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toBeNull(); + }); +}); + +describe("queryPerClientRateTimeline", () => { + it("returns empty array when StartQuery returns no queryId", async () => { + mockCloudWatchClient.send.mockResolvedValueOnce({} as never); + + const result = await queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + expect(result).toEqual([]); + }); + + it("returns empty array when the query status is Failed", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr1" } as never) + .mockResolvedValueOnce({ status: "Failed" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when results are empty", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when results is undefined", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2b" } as never) + .mockResolvedValueOnce({ status: "Complete" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("defaults missing fields to zero", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2c" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "unknownField", value: "123" }]], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(1); + expect(result[0].deliveryAttempts).toBe(0); + expect(result[0].timestampSec).toBe( + Math.floor(new Date("0").getTime() / 1000), + ); + }); + + it("returns entries sorted by time bin when query completes", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr3" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "timeBin", value: "2026-04-09 10:00:00.000" }, + { field: "deliveryAttempts", value: "42" }, + ], + [ + { field: "timeBin", value: "2026-04-09 10:00:10.000" }, + { field: "deliveryAttempts", value: "38" }, + ], + ], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(2); + expect(result[0]).toEqual({ + timestampSec: Math.floor( + new Date("2026-04-09 10:00:00.000").getTime() / 1000, + ), + deliveryAttempts: 42, + }); + expect(result[1]).toEqual({ + timestampSec: Math.floor( + new Date("2026-04-09 
10:00:10.000").getTime() / 1000, + ), + deliveryAttempts: 38, + }); + }); + + it("sends logGroupName to StartQuery", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr4" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 100, + 200, + ); + + await jest.runAllTimersAsync(); + await promise; + + const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as { + input: { logGroupName: string; startTime: number; endTime: number }; + }; + expect(startCmd.input.logGroupName).toBe( + "/aws/lambda/test-https-client-perf-client-1", + ); + expect(startCmd.input.startTime).toBe(100); + expect(startCmd.input.endTime).toBe(200); + }); + + it("polls until the query becomes Complete", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr5" } as never) + .mockResolvedValueOnce({ status: "Running" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "timeBin", value: "2026-04-09 10:00:00.000" }, + { field: "deliveryAttempts", value: "5" }, + ], + ], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(1); + expect(result[0].deliveryAttempts).toBe(5); + expect(mockCloudWatchClient.send).toHaveBeenCalledTimes(3); + }); + + it("returns empty array when the query does not complete within the timeout", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr6" } as never) + .mockResolvedValue({ status: "Running" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.advanceTimersByTimeAsync(60_000); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when the query status is Cancelled", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr7" } as never) + .mockResolvedValueOnce({ status: "Cancelled" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts new file mode 100644 index 00000000..09846ed3 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts @@ -0,0 +1,74 @@ +import { flushElastiCache } from "elasticache"; +import type { ElastiCacheDeps } from "types"; + +const mockConnect = jest.fn().mockResolvedValue(undefined); +const mockFlushAll = jest.fn().mockResolvedValue("OK"); +const mockDisconnect = jest.fn().mockResolvedValue(undefined); +let mockIsOpen = true; + +jest.mock("@redis/client", () => ({ + createClient: jest.fn(() => ({ + connect: mockConnect, + flushAll: mockFlushAll, + disconnect: mockDisconnect, + get isOpen() { + return mockIsOpen; + }, + })), +})); + +jest.mock("@smithy/signature-v4", () => ({ + SignatureV4: jest.fn(() => ({ + presign: jest.fn().mockResolvedValue({ + query: { + "X-Amz-Algorithm": "AWS4-HMAC-SHA256", + "X-Amz-Credential": "test-credential", + }, 
+ }), + })), +})); + +jest.mock("@aws-crypto/sha256-js", () => ({ + Sha256: jest.fn(), +})); + +jest.mock("@aws-sdk/credential-providers", () => ({ + fromNodeProviderChain: jest.fn(() => ({})), +})); + +const deps: ElastiCacheDeps = { + endpoint: "test-cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", +}; + +beforeEach(() => { + jest.clearAllMocks(); + mockIsOpen = true; +}); + +describe("flushElastiCache", () => { + it("connects, flushes all keys, and disconnects", async () => { + await flushElastiCache(deps); + + expect(mockConnect).toHaveBeenCalledTimes(1); + expect(mockFlushAll).toHaveBeenCalledTimes(1); + expect(mockDisconnect).toHaveBeenCalledTimes(1); + }); + + it("disconnects even when flushAll throws", async () => { + mockFlushAll.mockRejectedValueOnce(new Error("flush failed")); + + await expect(flushElastiCache(deps)).rejects.toThrow("flush failed"); + expect(mockDisconnect).toHaveBeenCalledTimes(1); + }); + + it("skips disconnect when client is not open", async () => { + mockIsOpen = false; + + await flushElastiCache(deps); + + expect(mockDisconnect).not.toHaveBeenCalled(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts index 1d1a501a..3c33bfd6 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts @@ -32,6 +32,7 @@ const mockResult: PerformanceResult = { phases: [], metrics: [], deliveryMetrics: [], + circuitBreakerMetrics: [], }; beforeEach(() => { @@ -42,6 +43,11 @@ beforeEach(() => { "/aws/lambda/nhs-dev-callbacks-client-transform-filter"; process.env.DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/nhs-dev-callbacks-https-client-"; + process.env.MOCK_WEBHOOK_LOG_GROUP = + "/aws/lambda/nhs-dev-callbacks-mock-webhook"; + process.env.ELASTICACHE_ENDPOINT = "cache.example.invalid"; + process.env.ELASTICACHE_CACHE_NAME = "test-cache"; + process.env.ELASTICACHE_IAM_USERNAME = "test-user"; process.env.AWS_REGION = "eu-west-2"; }); @@ -55,9 +61,17 @@ describe("handler", () => { queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-https-client-", + mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", }), DEFAULT_SCENARIO, "test-id", + undefined, + expect.objectContaining({ + endpoint: "cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", + }), ); }); @@ -73,6 +87,8 @@ describe("handler", () => { expect.anything(), customScenario, "custom-test", + undefined, + expect.anything(), ); }); @@ -117,6 +133,38 @@ describe("handler", () => { }), DEFAULT_SCENARIO, "no-prefix-test", + undefined, + expect.anything(), + ); + }); + + it("passes undefined elastiCacheDeps when ElastiCache env vars are missing", async () => { + delete process.env.ELASTICACHE_ENDPOINT; + delete process.env.ELASTICACHE_CACHE_NAME; + delete process.env.ELASTICACHE_IAM_USERNAME; + + await handler({ testId: "no-cache-test" }); + + expect(mockRunPerformanceTest).toHaveBeenCalledWith( + expect.anything(), + DEFAULT_SCENARIO, + "no-cache-test", + undefined, + undefined, + ); + }); + + it("passes mockWebhookLogGroup from env var", async () => { + await handler({ testId: "webhook-test" }); + + expect(mockRunPerformanceTest).toHaveBeenCalledWith( + expect.objectContaining({ + mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", + 
}), + expect.anything(), + "webhook-test", + undefined, + expect.anything(), ); }); }); diff --git a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts new file mode 100644 index 00000000..60347ef9 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts @@ -0,0 +1,116 @@ +import type { SQSClient } from "@aws-sdk/client-sqs"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import type { Scenario } from "types"; + +const scenario: Scenario = { + phases: [{ durationSecs: 1, targetEps: 10 }], + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + metricsIntervalSecs: 5, +}; + +const inboundQueueUrl = + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue"; + +describe("deriveQueueUrls", () => { + it("derives all queue URLs from the inbound queue URL and scenario", () => { + const urls = deriveQueueUrls(inboundQueueUrl, scenario); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-dlq-queue", + ]); + }); + + it("deduplicates client IDs that appear multiple times in eventMix", () => { + const duplicateScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-1", + channelStatus: "DELIVERED", + }, + ], + }; + + const urls = deriveQueueUrls(inboundQueueUrl, duplicateScenario); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue", + ]); + }); +}); + +describe("purgeQueues", () => { + const mockSend = jest.fn().mockResolvedValue({}); + const mockSqsClient = { send: mockSend } as unknown as SQSClient; + + beforeEach(() => { + jest.clearAllMocks(); + mockSend.mockResolvedValue({}); + }); + + it("sends a PurgeQueueCommand for each queue URL", async () => { + const urls = [ + "https://sqs.example.invalid/queue-a", + "https://sqs.example.invalid/queue-b", + ]; + + await purgeQueues(mockSqsClient, urls); + + expect(mockSend).toHaveBeenCalledTimes(2); + }); + + it("ignores NonExistentQueue errors gracefully", async () => { + const nonExistentError = Object.assign(new Error("Queue does not exist"), { + name: "AWS.SimpleQueueService.NonExistentQueue", + }); + mockSend.mockRejectedValueOnce(nonExistentError); + + await expect( + purgeQueues(mockSqsClient, ["https://sqs.example.invalid/missing"]), + ).resolves.toBeUndefined(); + }); + + it("rethrows 
non-NonExistentQueue errors", async () => { + const otherError = new Error("Access denied"); + mockSend.mockRejectedValueOnce(otherError); + + await expect( + purgeQueues(mockSqsClient, ["https://sqs.example.invalid/queue"]), + ).rejects.toThrow("Access denied"); + }); + + it("handles an empty queue URL list without sending any commands", async () => { + await purgeQueues(mockSqsClient, []); + + expect(mockSend).not.toHaveBeenCalled(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts index 1cf5f3a3..46a0928d 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts @@ -1,6 +1,7 @@ import type { SQSClient } from "@aws-sdk/client-sqs"; import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; import type { + CircuitBreakerSnapshot, DeliveryMetricsSnapshot, MetricsSnapshot, PhaseResult, @@ -10,16 +11,35 @@ import type { import { defaultSleep, runPerformanceTest } from "runner"; import { generatePhaseLoad } from "sqs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import { flushElastiCache } from "elasticache"; +import { verifyMockWebhook } from "webhook-verify"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; jest.mock("sqs"); jest.mock("cloudwatch"); +jest.mock("purge"); +jest.mock("elasticache"); +jest.mock("webhook-verify"); const mockGeneratePhaseLoad = jest.mocked(generatePhaseLoad); const mockQueryMetricsSnapshot = jest.mocked(queryMetricsSnapshot); const mockQueryDeliveryMetricsSnapshot = jest.mocked( queryDeliveryMetricsSnapshot, ); +const mockQueryCircuitBreakerSnapshot = jest.mocked( + queryCircuitBreakerSnapshot, +); +const mockQueryPerClientRateTimeline = jest.mocked(queryPerClientRateTimeline); +const mockDeriveQueueUrls = jest.mocked(deriveQueueUrls); +const mockPurgeQueues = jest.mocked(purgeQueues); +const mockFlushElastiCache = jest.mocked(flushElastiCache); +const mockVerifyMockWebhook = jest.mocked(verifyMockWebhook); const immediateSleep = jest.fn().mockResolvedValue(undefined); @@ -46,6 +66,20 @@ const mockDeliverySnapshot: DeliveryMetricsSnapshot = { p99Ms: 500, }; +const mockCbSnapshot: CircuitBreakerSnapshot = { + snapshotAt: Date.now(), + intervalStartSec: 0, + intervalEndSec: 60, + circuitOpenEvents: 1, + circuitCloseEvents: 0, + admissionDeniedCircuitOpen: 5, + admissionDeniedRateLimited: 3, + deliveryAttempts: 100, + deliverySuccesses: 92, + deliveryFailures: 5, + deliveryRateLimited: 3, +}; + const scenario: Scenario = { phases: [{ durationSecs: 1, targetEps: 1000 }], eventMix: [ @@ -71,6 +105,17 @@ beforeEach(() => { jest.clearAllMocks(); mockGeneratePhaseLoad.mockResolvedValue(mockPhaseResult); mockQueryDeliveryMetricsSnapshot.mockResolvedValue(null); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline.mockResolvedValue([]); + mockDeriveQueueUrls.mockReturnValue([ + "https://sqs.example.invalid/inbound-event-queue", + ]); + mockPurgeQueues.mockResolvedValue(undefined); + mockFlushElastiCache.mockResolvedValue(undefined); + mockVerifyMockWebhook.mockResolvedValue({ + receivedCallbacks: 0, + verified: false, + }); immediateSleep.mockResolvedValue(undefined); }); @@ -78,6 +123,7 @@ describe("runPerformanceTest", () => { it("returns a PerformanceResult with 
phase results and snapshots from polling and final query", async () => { mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); const result = await runPerformanceTest( deps, @@ -92,6 +138,7 @@ describe("runPerformanceTest", () => { expect(result.phases[0]).toEqual(mockPhaseResult); expect(result.metrics).toHaveLength(2); // one mid-test, one final expect(result.deliveryMetrics).toHaveLength(2); // one mid-test, one final + expect(result.circuitBreakerMetrics).toHaveLength(2); // one mid-test, one final expect(result.startedAt).toBeTruthy(); expect(result.completedAt).toBeTruthy(); }); @@ -111,6 +158,7 @@ describe("runPerformanceTest", () => { expect(result.metrics).toHaveLength(1); expect(result.metrics[0]).toEqual(mockSnapshot); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("produces an empty metrics array when all queries return null", async () => { @@ -125,6 +173,7 @@ describe("runPerformanceTest", () => { expect(result.metrics).toHaveLength(0); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("runs all phases and collects each result", async () => { @@ -267,7 +316,9 @@ describe("runPerformanceTest", () => { ); expect(mockQueryDeliveryMetricsSnapshot).not.toHaveBeenCalled(); + expect(mockQueryCircuitBreakerSnapshot).not.toHaveBeenCalled(); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("builds delivery log group names from prefix and event mix client IDs", async () => { @@ -309,6 +360,273 @@ describe("runPerformanceTest", () => { expect.any(Number), ); }); + + it("collects circuit breaker metrics when deliveryLogGroupPrefix is set", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); + + const result = await runPerformanceTest( + deps, + scenario, + "test-cb-1", + immediateSleep, + ); + + expect(result.circuitBreakerMetrics.length).toBeGreaterThanOrEqual(1); + expect(mockQueryCircuitBreakerSnapshot).toHaveBeenCalled(); + }); + + it("returns empty circuitBreakerMetrics when CB queries return null", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(null); + + const result = await runPerformanceTest( + deps, + scenario, + "test-cb-null", + immediateSleep, + ); + + expect(result.circuitBreakerMetrics).toHaveLength(0); + }); + + it("uses per-interval windowing for circuit breaker snapshots", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); + + let resolvePhase!: (value: PhaseResult) => void; + mockGeneratePhaseLoad.mockImplementation( + () => + new Promise((r) => { + resolvePhase = r; + }), + ); + + let sleepCount = 0; + const controlledSleep = jest.fn(async () => { + sleepCount += 1; + if (sleepCount >= 3) { + resolvePhase(mockPhaseResult); + } + }); + + await runPerformanceTest( + deps, + scenario, + "test-cb-interval", + controlledSleep, + ); + + const cbCalls = 
mockQueryCircuitBreakerSnapshot.mock.calls; + expect(cbCalls.length).toBeGreaterThanOrEqual(2); + const firstCallEndSec = cbCalls[0][3]; + const secondCallStartSec = cbCalls[1][2]; + expect(secondCallStartSec).toBe(firstCallEndSec); + }); + + it("collects per-client rate timelines when deliveryLogGroupPrefix is set", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryPerClientRateTimeline.mockResolvedValue([ + { timestampSec: 1000, deliveryAttempts: 10 }, + ]); + + const result = await runPerformanceTest( + deps, + scenario, + "test-pcr-1", + immediateSleep, + ); + + expect(result.perClientRateTimelines).toHaveLength(1); + expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1"); + expect(result.perClientRateTimelines![0].entries).toHaveLength(1); + }); + + it("queries each client log group individually for rate timelines", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline.mockResolvedValue([ + { timestampSec: 1000, deliveryAttempts: 5 }, + ]); + + const multiClientScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + }; + + const result = await runPerformanceTest( + deps, + multiClientScenario, + "test-pcr-multi", + immediateSleep, + ); + + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledTimes(2); + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( + deps.cloudWatchClient, + "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-1", + expect.any(Number), + expect.any(Number), + ); + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( + deps.cloudWatchClient, + "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-2", + expect.any(Number), + expect.any(Number), + ); + expect(result.perClientRateTimelines).toHaveLength(2); + }); + + it("excludes clients with empty rate timelines", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline + .mockResolvedValueOnce([{ timestampSec: 1000, deliveryAttempts: 5 }]) + .mockResolvedValueOnce([]); + + const multiClientScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + }; + + const result = await runPerformanceTest( + deps, + multiClientScenario, + "test-pcr-filter", + immediateSleep, + ); + + expect(result.perClientRateTimelines).toHaveLength(1); + expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1"); + }); + + it("skips per-client rate timelines when deliveryLogGroupPrefix is undefined", async () => { + const depsWithoutPrefix: RunnerDeps = { + ...deps, + deliveryLogGroupPrefix: undefined, + }; + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + + const result = await runPerformanceTest( + depsWithoutPrefix, + scenario, + "test-pcr-skip", + immediateSleep, + ); + + expect(mockQueryPerClientRateTimeline).not.toHaveBeenCalled(); + expect(result.perClientRateTimelines).toHaveLength(0); + }); + + it("purges queues before and after the test run", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + await 
runPerformanceTest(deps, scenario, "test-purge", immediateSleep); + + expect(mockDeriveQueueUrls).toHaveBeenCalledWith(deps.queueUrl, scenario); + expect(mockPurgeQueues).toHaveBeenCalledTimes(2); + }); + + it("flushes ElastiCache before and after when deps are provided", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + const elastiCacheDeps = { + endpoint: "cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", + }; + + await runPerformanceTest( + deps, + scenario, + "test-flush", + immediateSleep, + elastiCacheDeps, + ); + + expect(mockFlushElastiCache).toHaveBeenCalledTimes(2); + expect(mockFlushElastiCache).toHaveBeenCalledWith(elastiCacheDeps); + }); + + it("skips ElastiCache flush when deps are not provided", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + await runPerformanceTest(deps, scenario, "test-no-flush", immediateSleep); + + expect(mockFlushElastiCache).not.toHaveBeenCalled(); + }); + + it("verifies mock webhook when log group is configured", async () => { + const depsWithWebhook: RunnerDeps = { + ...deps, + mockWebhookLogGroup: "/aws/lambda/test-mock-webhook", + }; + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockVerifyMockWebhook.mockResolvedValue({ + receivedCallbacks: 25, + verified: true, + }); + + const result = await runPerformanceTest( + depsWithWebhook, + scenario, + "test-webhook", + immediateSleep, + ); + + expect(mockVerifyMockWebhook).toHaveBeenCalledWith( + depsWithWebhook.cloudWatchClient, + "/aws/lambda/test-mock-webhook", + expect.any(Number), + expect.any(Number), + ); + expect(result.webhookVerification).toEqual({ + receivedCallbacks: 25, + verified: true, + }); + }); + + it("omits webhook verification when log group is not configured", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + const result = await runPerformanceTest( + deps, + scenario, + "test-no-webhook", + immediateSleep, + ); + + expect(mockVerifyMockWebhook).not.toHaveBeenCalled(); + expect(result.webhookVerification).toBeUndefined(); + }); }); describe("defaultSleep", () => { diff --git a/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts new file mode 100644 index 00000000..72c49870 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts @@ -0,0 +1,173 @@ +import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; +import { verifyMockWebhook } from "webhook-verify"; + +const mockSend = jest.fn(); +const mockClient = { send: mockSend } as unknown as CloudWatchLogsClient; + +beforeEach(() => { + jest.clearAllMocks(); +}); + +describe("verifyMockWebhook", () => { + it("returns verified=true when callbacks are found", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-1" }).mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "42" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 42, verified: true }); + }); + + it("returns verified=false when no callbacks are found", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-2" }).mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "0" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ 
receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when query fails", async () => { + mockSend + .mockResolvedValueOnce({ queryId: "q-3" }) + .mockResolvedValueOnce({ status: "Failed" }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when no queryId is returned", async () => { + mockSend.mockResolvedValueOnce({}); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when results are empty", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-4" }).mockResolvedValueOnce({ + status: "Complete", + results: [], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when results field is undefined", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-4b" }).mockResolvedValueOnce({ + status: "Complete", + results: undefined, + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("polls until the query completes", async () => { + mockSend + .mockResolvedValueOnce({ queryId: "q-5" }) + .mockResolvedValueOnce({ status: "Running" }) + .mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "10" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 10, verified: true }); + expect(mockSend).toHaveBeenCalledTimes(3); + }); + + it("returns verified=false when query is cancelled", async () => { + mockSend + .mockResolvedValueOnce({ queryId: "q-6" }) + .mockResolvedValueOnce({ status: "Cancelled" }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when polling times out", async () => { + jest.useFakeTimers(); + + mockSend.mockResolvedValueOnce({ queryId: "q-7" }).mockImplementation( + () => + new Promise((resolve) => { + setTimeout(() => resolve({ status: "Running" }), 1000); + }), + ); + + const originalDateNow = Date.now; + let callCount = 0; + jest.spyOn(Date, "now").mockImplementation(() => { + callCount += 1; + if (callCount <= 1) return originalDateNow.call(Date); + return originalDateNow.call(Date) + 60_000; + }); + + const promise = verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + await jest.advanceTimersByTimeAsync(60_000); + + const result = await promise; + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + + jest.useRealTimers(); + jest.restoreAllMocks(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/cloudwatch.ts b/lambdas/perf-runner-lambda/src/cloudwatch.ts index 206bec33..598f5f3f 100644 --- a/lambdas/perf-runner-lambda/src/cloudwatch.ts +++ b/lambdas/perf-runner-lambda/src/cloudwatch.ts @@ -3,19 +3,22 @@ import { GetQueryResultsCommand, StartQueryCommand, } from 
"@aws-sdk/client-cloudwatch-logs"; -import type { DeliveryMetricsSnapshot, MetricsSnapshot } from "types"; +import type { + CircuitBreakerSnapshot, + DeliveryMetricsSnapshot, + MetricsSnapshot, + PerClientRateEntry, +} from "types"; const INSIGHTS_POLL_INTERVAL_MS = 2000; const INSIGHTS_TIMEOUT_MS = 30_000; type ResultField = { field?: string; value?: string }; -async function pollQueryResults( +async function pollInsightsQuery( client: CloudWatchLogsClient, queryId: string, - mapRow: (row: ResultField[]) => T, -): Promise { - const zeroResult = mapRow([]); +): Promise { const deadline = Date.now() + INSIGHTS_TIMEOUT_MS; while (Date.now() < deadline) { @@ -30,15 +33,33 @@ async function pollQueryResults( } if (response.status === "Complete") { - const row = response.results?.[0]; - if (!row) return zeroResult; - return mapRow(row); + return (response.results as ResultField[][]) ?? []; } } return null; } +async function pollQueryResults( + client: CloudWatchLogsClient, + queryId: string, + mapRow: (row: ResultField[]) => T, +): Promise { + const rows = await pollInsightsQuery(client, queryId); + if (rows === null) return null; + return mapRow(rows[0] ?? []); +} + +async function pollAllQueryResults( + client: CloudWatchLogsClient, + queryId: string, + mapRow: (row: ResultField[]) => T, +): Promise { + const rows = await pollInsightsQuery(client, queryId); + if (rows === null) return []; + return rows.map((row) => mapRow(row)); +} + export async function queryMetricsSnapshot( client: CloudWatchLogsClient, logGroupName: string, @@ -108,3 +129,86 @@ export async function queryDeliveryMetricsSnapshot( }; }); } + +export async function queryCircuitBreakerSnapshot( + client: CloudWatchLogsClient, + logGroupNames: string[], + startTimeSec: number, + endTimeSec: number, +): Promise { + if (logGroupNames.length === 0) return null; + + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupNames, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg in ["Circuit breaker opened", "Circuit breaker closed", "Admission denied", "Attempting delivery", "Delivery succeeded", "Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ", "Rate limited (429)"]', + '| stats sum(msg = "Circuit breaker opened") as circuitOpenEvents,', + ' sum(msg = "Circuit breaker closed") as circuitCloseEvents,', + ' sum(msg = "Admission denied" and reason = "circuit_open") as admissionDeniedCircuitOpen,', + ' sum(msg = "Admission denied" and reason = "rate_limited") as admissionDeniedRateLimited,', + ' sum(msg = "Attempting delivery") as deliveryAttempts,', + ' sum(msg = "Delivery succeeded") as deliverySuccesses,', + ' sum(msg in ["Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ"]) as deliveryFailures,', + ' sum(msg = "Rate limited (429)") as deliveryRateLimited', + ].join("\n"), + }), + ); + + if (!queryId) return null; + + return pollQueryResults(client, queryId, (row) => { + const getField = (name: string): number => + Number(row.find((f) => f.field === name)?.value ?? 
0); + + return { + snapshotAt: Date.now(), + intervalStartSec: startTimeSec, + intervalEndSec: endTimeSec, + circuitOpenEvents: getField("circuitOpenEvents"), + circuitCloseEvents: getField("circuitCloseEvents"), + admissionDeniedCircuitOpen: getField("admissionDeniedCircuitOpen"), + admissionDeniedRateLimited: getField("admissionDeniedRateLimited"), + deliveryAttempts: getField("deliveryAttempts"), + deliverySuccesses: getField("deliverySuccesses"), + deliveryFailures: getField("deliveryFailures"), + deliveryRateLimited: getField("deliveryRateLimited"), + }; + }); +} + +const RATE_TIMELINE_BIN_SECONDS = 10; + +export async function queryPerClientRateTimeline( + client: CloudWatchLogsClient, + logGroupName: string, + startTimeSec: number, + endTimeSec: number, +): Promise { + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupName, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg in ["Attempting delivery", "Admission denied"]', + `| stats sum(msg = "Attempting delivery") as deliveryAttempts by bin(@timestamp, ${RATE_TIMELINE_BIN_SECONDS}s) as timeBin`, + "| sort timeBin asc", + ].join("\n"), + }), + ); + + if (!queryId) return []; + + return pollAllQueryResults(client, queryId, (row) => { + const timeBinStr = row.find((f) => f.field === "timeBin")?.value ?? "0"; + const timestampSec = Math.floor(new Date(timeBinStr).getTime() / 1000); + const deliveryAttempts = Number( + row.find((f) => f.field === "deliveryAttempts")?.value ?? 0, + ); + + return { timestampSec, deliveryAttempts }; + }); +} diff --git a/lambdas/perf-runner-lambda/src/elasticache.ts b/lambdas/perf-runner-lambda/src/elasticache.ts new file mode 100644 index 00000000..8d0b86c6 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/elasticache.ts @@ -0,0 +1,52 @@ +import { type RedisClientType, createClient } from "@redis/client"; +import { SignatureV4 } from "@smithy/signature-v4"; +import { Sha256 } from "@aws-crypto/sha256-js"; +import { fromNodeProviderChain } from "@aws-sdk/credential-providers"; +import type { ElastiCacheDeps } from "types"; + +const TOKEN_EXPIRY_SECONDS = 900; + +async function generateIamToken(deps: ElastiCacheDeps): Promise { + const signer = new SignatureV4({ + credentials: fromNodeProviderChain(), + region: deps.region, + service: "elasticache", + sha256: Sha256, + }); + + const signed = await signer.presign( + { + protocol: "https:", + method: "GET", + hostname: deps.cacheName, + path: "/", + query: { Action: "connect", User: deps.iamUsername }, + headers: { host: deps.cacheName }, + }, + { expiresIn: TOKEN_EXPIRY_SECONDS }, + ); + + const qs = new URLSearchParams( + signed.query as Record, + ).toString(); + return `${deps.cacheName}/?${qs}`; +} + +export async function flushElastiCache(deps: ElastiCacheDeps): Promise { + const token = await generateIamToken(deps); + + const client: RedisClientType = createClient({ + url: `rediss://${deps.endpoint}:6379`, + username: deps.iamUsername, + password: token, + }); + + try { + await client.connect(); + await client.flushAll(); + } finally { + if (client.isOpen) { + await client.disconnect(); + } + } +} diff --git a/lambdas/perf-runner-lambda/src/index.ts b/lambdas/perf-runner-lambda/src/index.ts index a0881866..5974627b 100644 --- a/lambdas/perf-runner-lambda/src/index.ts +++ b/lambdas/perf-runner-lambda/src/index.ts @@ -3,7 +3,11 @@ import { SQSClient } from "@aws-sdk/client-sqs"; import { Logger } from "@nhs-notify-client-callbacks/logger"; import { runPerformanceTest } from "runner"; import { 
DEFAULT_SCENARIO } from "scenario"; -import type { PerfRunnerPayload, PerformanceResult } from "types"; +import type { + ElastiCacheDeps, + PerfRunnerPayload, + PerformanceResult, +} from "types"; const logger = new Logger(); @@ -16,6 +20,10 @@ export async function handler( const queueUrl = process.env.INBOUND_QUEUE_URL; const logGroupName = process.env.TRANSFORM_FILTER_LOG_GROUP; const deliveryLogGroupPrefix = process.env.DELIVERY_LOG_GROUP_PREFIX; + const mockWebhookLogGroup = process.env.MOCK_WEBHOOK_LOG_GROUP; + const elasticacheEndpoint = process.env.ELASTICACHE_ENDPOINT; + const elasticacheCacheName = process.env.ELASTICACHE_CACHE_NAME; + const elasticacheIamUsername = process.env.ELASTICACHE_IAM_USERNAME; if (!queueUrl) { throw new Error("Missing required environment variable: INBOUND_QUEUE_URL"); @@ -30,6 +38,16 @@ export async function handler( const sqsClient = new SQSClient({ region }); const cloudWatchClient = new CloudWatchLogsClient({ region }); + const elastiCacheDeps: ElastiCacheDeps | undefined = + elasticacheEndpoint && elasticacheCacheName && elasticacheIamUsername + ? { + endpoint: elasticacheEndpoint, + cacheName: elasticacheCacheName, + iamUsername: elasticacheIamUsername, + region, + } + : undefined; + logger.info("Performance test started", { testId }); try { @@ -40,9 +58,12 @@ export async function handler( queueUrl, logGroupName, deliveryLogGroupPrefix, + mockWebhookLogGroup, }, scenario, testId, + undefined, + elastiCacheDeps, ); logger.info("Performance test completed", { testId }); diff --git a/lambdas/perf-runner-lambda/src/purge.ts b/lambdas/perf-runner-lambda/src/purge.ts new file mode 100644 index 00000000..5743e9d2 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/purge.ts @@ -0,0 +1,40 @@ +import { PurgeQueueCommand, type SQSClient } from "@aws-sdk/client-sqs"; +import type { Scenario } from "types"; + +export function deriveQueueUrls( + inboundQueueUrl: string, + scenario: Scenario, +): string[] { + // eslint-disable-next-line sonarjs/null-dereference -- String.replace always returns a string + const baseUrl = inboundQueueUrl.replace(/inbound-event-queue$/, ""); + const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))]; + + return [ + inboundQueueUrl, + `${baseUrl}inbound-event-dlq-queue`, + ...clientIds.flatMap((id) => [ + `${baseUrl}${id}-delivery-queue`, + `${baseUrl}${id}-delivery-dlq-queue`, + ]), + ]; +} + +export async function purgeQueues( + client: SQSClient, + queueUrls: string[], +): Promise { + const results = await Promise.allSettled( + queueUrls.map((url) => + client.send(new PurgeQueueCommand({ QueueUrl: url })), + ), + ); + + for (const result of results) { + if (result.status === "rejected") { + const error = result.reason as { name?: string }; + if (error.name !== "AWS.SimpleQueueService.NonExistentQueue") { + throw result.reason as Error; + } + } + } +} diff --git a/lambdas/perf-runner-lambda/src/runner.ts b/lambdas/perf-runner-lambda/src/runner.ts index a265e90e..321abc45 100644 --- a/lambdas/perf-runner-lambda/src/runner.ts +++ b/lambdas/perf-runner-lambda/src/runner.ts @@ -1,13 +1,25 @@ import type { + CircuitBreakerSnapshot, DeliveryMetricsSnapshot, + ElastiCacheDeps, MetricsSnapshot, + PerClientRateTimeline, PerformanceResult, PhaseResult, RunnerDeps, Scenario, + WebhookVerificationResult, } from "types"; import { generatePhaseLoad } from "sqs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import { flushElastiCache } 
from "elasticache"; +import { verifyMockWebhook } from "webhook-verify"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; const CLOUDWATCH_SETTLING_MS = 60_000; @@ -25,11 +37,56 @@ function buildDeliveryLogGroupNames( return [...clientIds].map((id) => `${prefix}${id}`); } +async function collectSnapshots( + deps: RunnerDeps, + deliveryLogGroupNames: string[], + startSec: number, + endSec: number, + cbStartSec: number, + out: { + snapshots: MetricsSnapshot[]; + deliverySnapshots: DeliveryMetricsSnapshot[]; + cbSnapshots: CircuitBreakerSnapshot[]; + }, +): Promise { + const snap = await queryMetricsSnapshot( + deps.cloudWatchClient, + deps.logGroupName, + startSec, + endSec, + ); + if (snap !== null) out.snapshots.push(snap); + + if (deliveryLogGroupNames.length > 0) { + const deliverySnap = await queryDeliveryMetricsSnapshot( + deps.cloudWatchClient, + deliveryLogGroupNames, + startSec, + endSec, + ); + if (deliverySnap !== null) out.deliverySnapshots.push(deliverySnap); + + const cbSnap = await queryCircuitBreakerSnapshot( + deps.cloudWatchClient, + deliveryLogGroupNames, + cbStartSec, + endSec, + ); + if (cbSnap !== null) { + out.cbSnapshots.push(cbSnap); + return endSec; + } + } + + return cbStartSec; +} + export async function runPerformanceTest( deps: RunnerDeps, scenario: Scenario, testId: string, sleepFn: (ms: number) => Promise = defaultSleep, + elastiCacheDeps?: ElastiCacheDeps, ): Promise { if (scenario.eventMix.length === 0) { throw new Error("scenario.eventMix must contain at least one entry"); @@ -49,10 +106,19 @@ export async function runPerformanceTest( } const testStartMs = Date.now(); + + const queueUrls = deriveQueueUrls(deps.queueUrl, scenario); + await purgeQueues(deps.sqsClient, queueUrls); + if (elastiCacheDeps) { + await flushElastiCache(elastiCacheDeps); + } + const startedAt = new Date(testStartMs).toISOString(); const phaseResults: PhaseResult[] = []; const snapshots: MetricsSnapshot[] = []; const deliverySnapshots: DeliveryMetricsSnapshot[] = []; + const cbSnapshots: CircuitBreakerSnapshot[] = []; + let lastCbSnapshotSec = Math.floor(testStartMs / 1000); let stopPolling = false; const deliveryLogGroupNames = buildDeliveryLogGroupNames( @@ -60,29 +126,22 @@ export async function runPerformanceTest( scenario, ); + const out = { snapshots, deliverySnapshots, cbSnapshots }; + const pollLoop = async (): Promise => { await sleepFn(scenario.metricsIntervalSecs * 1000); while (!stopPolling) { const startSec = Math.floor(testStartMs / 1000); const endSec = Math.floor(Date.now() / 1000); - const snap = await queryMetricsSnapshot( - deps.cloudWatchClient, - deps.logGroupName, + lastCbSnapshotSec = await collectSnapshots( + deps, + deliveryLogGroupNames, startSec, endSec, + lastCbSnapshotSec, + out, ); - if (snap !== null) snapshots.push(snap); - - if (deliveryLogGroupNames.length > 0) { - const deliverySnap = await queryDeliveryMetricsSnapshot( - deps.cloudWatchClient, - deliveryLogGroupNames, - startSec, - endSec, - ); - if (deliverySnap !== null) deliverySnapshots.push(deliverySnap); - } if (!stopPolling) { await sleepFn(scenario.metricsIntervalSecs * 1000); @@ -110,22 +169,48 @@ export async function runPerformanceTest( const finalStartSec = Math.floor(testStartMs / 1000); const finalEndSec = Math.floor(Date.now() / 1000); - const finalSnap = await queryMetricsSnapshot( - deps.cloudWatchClient, - deps.logGroupName, + await collectSnapshots( + deps, + deliveryLogGroupNames, 
finalStartSec, finalEndSec, + lastCbSnapshotSec, + out, ); - if (finalSnap !== null) snapshots.push(finalSnap); - if (deliveryLogGroupNames.length > 0) { - const finalDeliverySnap = await queryDeliveryMetricsSnapshot( + const perClientRateTimelines: PerClientRateTimeline[] = []; + + if (deps.deliveryLogGroupPrefix) { + const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))]; + const timelinePromises = clientIds.map(async (clientId) => { + const logGroupName = `${deps.deliveryLogGroupPrefix}${clientId}`; + const entries = await queryPerClientRateTimeline( + deps.cloudWatchClient, + logGroupName, + finalStartSec, + finalEndSec, + ); + return { clientId, entries }; + }); + const timelines = await Promise.all(timelinePromises); + perClientRateTimelines.push( + ...timelines.filter((t) => t.entries.length > 0), + ); + } + + let webhookVerification: WebhookVerificationResult | undefined; + if (deps.mockWebhookLogGroup) { + webhookVerification = await verifyMockWebhook( deps.cloudWatchClient, - deliveryLogGroupNames, + deps.mockWebhookLogGroup, finalStartSec, finalEndSec, ); - if (finalDeliverySnap !== null) deliverySnapshots.push(finalDeliverySnap); + } + + await purgeQueues(deps.sqsClient, queueUrls); + if (elastiCacheDeps) { + await flushElastiCache(elastiCacheDeps); } return { @@ -136,5 +221,8 @@ export async function runPerformanceTest( phases: phaseResults, metrics: snapshots, deliveryMetrics: deliverySnapshots, + circuitBreakerMetrics: cbSnapshots, + perClientRateTimelines, + webhookVerification, }; } diff --git a/lambdas/perf-runner-lambda/src/types.ts b/lambdas/perf-runner-lambda/src/types.ts index 5366602d..24df2a50 100644 --- a/lambdas/perf-runner-lambda/src/types.ts +++ b/lambdas/perf-runner-lambda/src/types.ts @@ -55,6 +55,35 @@ export type DeliveryMetricsSnapshot = { p99Ms: number; }; +export type CircuitBreakerSnapshot = { + snapshotAt: number; + intervalStartSec: number; + intervalEndSec: number; + circuitOpenEvents: number; + circuitCloseEvents: number; + admissionDeniedCircuitOpen: number; + admissionDeniedRateLimited: number; + deliveryAttempts: number; + deliverySuccesses: number; + deliveryFailures: number; + deliveryRateLimited: number; +}; + +export type PerClientRateEntry = { + timestampSec: number; + deliveryAttempts: number; +}; + +export type PerClientRateTimeline = { + clientId: string; + entries: PerClientRateEntry[]; +}; + +export type WebhookVerificationResult = { + receivedCallbacks: number; + verified: boolean; +}; + export type PerformanceResult = { testId: string; scenario: Scenario; @@ -63,6 +92,9 @@ export type PerformanceResult = { phases: PhaseResult[]; metrics: MetricsSnapshot[]; deliveryMetrics: DeliveryMetricsSnapshot[]; + circuitBreakerMetrics: CircuitBreakerSnapshot[]; + perClientRateTimelines?: PerClientRateTimeline[]; + webhookVerification?: WebhookVerificationResult; }; export type PerfRunnerPayload = { @@ -76,4 +108,12 @@ export type RunnerDeps = { queueUrl: string; logGroupName: string; deliveryLogGroupPrefix?: string; + mockWebhookLogGroup?: string; +}; + +export type ElastiCacheDeps = { + endpoint: string; + cacheName: string; + iamUsername: string; + region: string; }; diff --git a/lambdas/perf-runner-lambda/src/webhook-verify.ts b/lambdas/perf-runner-lambda/src/webhook-verify.ts new file mode 100644 index 00000000..77c1fa6d --- /dev/null +++ b/lambdas/perf-runner-lambda/src/webhook-verify.ts @@ -0,0 +1,59 @@ +import { + type CloudWatchLogsClient, + GetQueryResultsCommand, + StartQueryCommand, +} from 
"@aws-sdk/client-cloudwatch-logs"; +import type { WebhookVerificationResult } from "types"; + +const INSIGHTS_POLL_INTERVAL_MS = 2000; +const INSIGHTS_TIMEOUT_MS = 30_000; + +export async function verifyMockWebhook( + client: CloudWatchLogsClient, + logGroupName: string, + startTimeSec: number, + endTimeSec: number, +): Promise { + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupName, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg = "Callback received"', + "| stats count(*) as callbackCount", + ].join("\n"), + }), + ); + + if (!queryId) { + return { receivedCallbacks: 0, verified: false }; + } + + const deadline = Date.now() + INSIGHTS_TIMEOUT_MS; + + while (Date.now() < deadline) { + await new Promise((resolve) => { + setTimeout(resolve, INSIGHTS_POLL_INTERVAL_MS); + }); + + const response = await client.send(new GetQueryResultsCommand({ queryId })); + + if (response.status === "Failed" || response.status === "Cancelled") { + return { receivedCallbacks: 0, verified: false }; + } + + if (response.status === "Complete") { + const rows = + (response.results as { field?: string; value?: string }[][]) ?? []; + const row = rows[0] ?? []; + const count = Number( + row.find((f) => f.field === "callbackCount")?.value ?? 0, + ); + + return { receivedCallbacks: count, verified: count > 0 }; + } + } + + return { receivedCallbacks: 0, verified: false }; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c497eafb..f2b2aa3a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -450,21 +450,30 @@ importers: lambdas/perf-runner-lambda: dependencies: + '@aws-crypto/sha256-js': + specifier: catalog:aws + version: 5.2.0 '@aws-sdk/client-cloudwatch-logs': specifier: catalog:aws version: 3.1026.0 '@aws-sdk/client-sqs': specifier: catalog:aws version: 3.1026.0 + '@aws-sdk/credential-providers': + specifier: catalog:aws + version: 3.1026.0 '@nhs-notify-client-callbacks/logger': specifier: workspace:* version: link:../../src/logger '@nhs-notify-client-callbacks/models': specifier: workspace:* version: link:../../src/models - esbuild: - specifier: catalog:tools - version: 0.28.0 + '@redis/client': + specifier: catalog:app + version: 1.6.1 + '@smithy/signature-v4': + specifier: catalog:aws + version: 5.3.13 devDependencies: '@tsconfig/node22': specifier: catalog:tools @@ -478,6 +487,9 @@ importers: '@types/node': specifier: catalog:tools version: 25.6.0 + esbuild: + specifier: catalog:tools + version: 0.28.0 eslint: specifier: catalog:lint version: 9.39.4(jiti@2.6.1) From 07410f72626213d7ba3b1936ae92e904c1ad5702 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Mon, 27 Apr 2026 10:02:29 +0100 Subject: [PATCH 2/3] CCM-16073 - Fixed perf runner permissions --- .../callbacks/module_perf_runner_lambda.tf | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf index f3f57981..7a77c40c 100644 --- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf +++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf @@ -143,4 +143,19 @@ data "aws_iam_policy_document" "perf_runner_lambda" { aws_elasticache_user.delivery_state_iam.arn, ] } + + statement { + sid = "VPCNetworkInterfacePermissions" + effect = "Allow" + + actions = [ + "ec2:CreateNetworkInterface", + "ec2:DeleteNetworkInterface", + "ec2:DescribeNetworkInterfaces", + ] + + resources = [ + "*", + ] + } } From 
6673cca4d97f6fa3844d2e5e70c6ef7ab0e01619 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Tue, 28 Apr 2026 09:53:51 +0100 Subject: [PATCH 3/3] CCM-16073 - Updated rate limiting behaviour --- .../terraform/components/callbacks/README.md | 1 + .../callbacks/module_client_delivery.tf | 2 ++ .../components/callbacks/variables.tf | 6 ++++ .../modules/client-delivery/README.md | 1 + .../module_https_client_lambda.tf | 1 + .../modules/client-delivery/variables.tf | 6 ++++ .../src/__tests__/endpoint-gate.test.ts | 28 ++++++++++++++----- .../src/__tests__/handler.test.ts | 27 ++++++++++++++++-- .../src/__tests__/record-result-lua.test.ts | 6 ++-- .../src/services/endpoint-gate.ts | 4 +-- .../src/services/record-result.lua | 3 +- 11 files changed, 69 insertions(+), 16 deletions(-) diff --git a/infrastructure/terraform/components/callbacks/README.md b/infrastructure/terraform/components/callbacks/README.md index 02804698..e090abb9 100644 --- a/infrastructure/terraform/components/callbacks/README.md +++ b/infrastructure/terraform/components/callbacks/README.md @@ -45,6 +45,7 @@ | [s3\_enable\_force\_destroy](#input\_s3\_enable\_force\_destroy) | Whether to enable force destroy for the S3 buckets created in this module | `bool` | `false` | no | | [sqs\_inbound\_event\_max\_receive\_count](#input\_sqs\_inbound\_event\_max\_receive\_count) | n/a | `number` | `3` | no | | [sqs\_inbound\_event\_visibility\_timeout\_seconds](#input\_sqs\_inbound\_event\_visibility\_timeout\_seconds) | n/a | `number` | `60` | no | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | ## Modules | Name | Source | Version | diff --git a/infrastructure/terraform/components/callbacks/module_client_delivery.tf b/infrastructure/terraform/components/callbacks/module_client_delivery.tf index ebc2e9e1..5122606e 100644 --- a/infrastructure/terraform/components/callbacks/module_client_delivery.tf +++ b/infrastructure/terraform/components/callbacks/module_client_delivery.tf @@ -41,6 +41,8 @@ module "client_delivery" { mtls_test_cert_s3_key = local.mtls_test_cert_s3_key # gitleaks:allow mtls_test_ca_s3_key = local.mtls_test_ca_s3_key # gitleaks:allow + token_bucket_burst_capacity = var.token_bucket_burst_capacity + vpc_subnet_ids = try(local.acct.private_subnets[local.bc_name], []) lambda_security_group_id = aws_security_group.https_client_lambda.id } diff --git a/infrastructure/terraform/components/callbacks/variables.tf b/infrastructure/terraform/components/callbacks/variables.tf index 9c71492d..aef32373 100644 --- a/infrastructure/terraform/components/callbacks/variables.tf +++ b/infrastructure/terraform/components/callbacks/variables.tf @@ -195,3 +195,9 @@ variable "elasticache_data_storage_maximum_gb" { description = "Maximum data storage in GB for the ElastiCache Serverless delivery state cache" default = 1 } + +variable "token_bucket_burst_capacity" { + type = number + description = "Token bucket burst capacity used by the rate limiter" + default = 2250 +} diff --git a/infrastructure/terraform/modules/client-delivery/README.md b/infrastructure/terraform/modules/client-delivery/README.md index 0a4965e7..2036c60d 100644 --- a/infrastructure/terraform/modules/client-delivery/README.md +++ b/infrastructure/terraform/modules/client-delivery/README.md @@ -45,6 +45,7 @@ No requirements. 
| [sqs\_visibility\_timeout\_seconds](#input\_sqs\_visibility\_timeout\_seconds) | Visibility timeout for the per-client delivery queue | `number` | `60` | no |
| [subscription\_targets](#input\_subscription\_targets) | Flattened subscription-target fanout map keyed by subscription-target composite key | <pre>map(object({<br>    subscription_id = string<br>    target_id = string<br>  }))</pre> | n/a | yes |
| [subscriptions](#input\_subscriptions) | Subscription definitions for this client, keyed by subscription\_id | <pre>map(object({<br>    subscription_id = string<br>    target_ids = list(string)<br>  }))</pre>
| n/a | yes | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | | [vpc\_subnet\_ids](#input\_vpc\_subnet\_ids) | VPC subnet IDs for Lambda execution | `list(string)` | `[]` | no | ## Modules diff --git a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf index 1260d471..0021fb80 100644 --- a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf +++ b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf @@ -53,6 +53,7 @@ module "https_client_lambda" { MTLS_TEST_CERT_S3_BUCKET = var.mtls_test_cert_s3_bucket MTLS_TEST_CERT_S3_KEY = var.mtls_test_cert_s3_key # gitleaks:allow QUEUE_URL = module.sqs_delivery.sqs_queue_url + TOKEN_BUCKET_BURST_CAPACITY = tostring(var.token_bucket_burst_capacity) } vpc_config = var.lambda_security_group_id != "" ? { diff --git a/infrastructure/terraform/modules/client-delivery/variables.tf b/infrastructure/terraform/modules/client-delivery/variables.tf index 643e163e..801ca291 100644 --- a/infrastructure/terraform/modules/client-delivery/variables.tf +++ b/infrastructure/terraform/modules/client-delivery/variables.tf @@ -181,6 +181,12 @@ variable "mtls_test_ca_s3_key" { default = "" } +variable "token_bucket_burst_capacity" { + type = number + description = "Token bucket burst capacity used by the rate limiter" + default = 2250 +} + variable "elasticache_endpoint" { type = string description = "ElastiCache Serverless endpoint URL" diff --git a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts index 2cc8cc31..c8327c3a 100644 --- a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts @@ -197,8 +197,8 @@ describe("evalScript", () => { }); describe("recordResult", () => { - it("returns closed on success below threshold", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + it("returns ok on steady-state success below threshold", async () => { + mockSendCommand.mockResolvedValueOnce([1, "ok"]); const result = await recordResult( mockRedis, @@ -208,7 +208,7 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: true, state: "closed" }); + expect(result).toEqual({ ok: true, state: "ok" }); expect(mockSendCommand).toHaveBeenCalledWith( expect.arrayContaining(["EVALSHA"]), ); @@ -228,6 +228,20 @@ describe("recordResult", () => { expect(result).toEqual({ ok: false, state: "opened" }); }); + it("returns closed when circuit transitions from open to closed", async () => { + mockSendCommand.mockResolvedValueOnce([1, "closed"]); + + const result = await recordResult( + mockRedis, + "target-1", + 5, + 0, + defaultConfig, + ); + + expect(result).toEqual({ ok: true, state: "closed" }); + }); + it("returns failed when failure is below threshold", async () => { mockSendCommand.mockResolvedValueOnce([0, "failed"]); @@ -245,7 +259,7 @@ describe("recordResult", () => { it("falls back to EVAL on NOSCRIPT error", async () => { mockSendCommand .mockRejectedValueOnce(new Error("NOSCRIPT No matching script")) - .mockResolvedValueOnce([1, "closed"]); + .mockResolvedValueOnce([1, "ok"]); const result = await recordResult( mockRedis, @@ -255,12 +269,12 @@ describe("recordResult", () => { defaultConfig, ); - 
expect(result).toEqual({ ok: true, state: "closed" }); + expect(result).toEqual({ ok: true, state: "ok" }); expect(mockSendCommand).toHaveBeenCalledTimes(2); }); it("passes correct ep key for target", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + mockSendCommand.mockResolvedValueOnce([1, "ok"]); await recordResult(mockRedis, "my-target", 1, 0, defaultConfig); @@ -269,7 +283,7 @@ describe("recordResult", () => { }); it("passes consumedTokens and processingFailures as ARGV", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + mockSendCommand.mockResolvedValueOnce([1, "ok"]); await recordResult(mockRedis, "target-1", 8, 3, defaultConfig); diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts index a2b7e8b4..f69d1d51 100644 --- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts @@ -116,7 +116,7 @@ describe("processRecords", () => { consumedTokens: 100, effectiveRate: 10, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + mockRecordResult.mockResolvedValue({ ok: true, state: "ok" }); }); it("returns no failures on successful delivery", async () => { @@ -515,7 +515,7 @@ describe("processRecords", () => { expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("does not record CircuitBreakerOpen when recordResult returns closed", async () => { + it("does not record CircuitBreakerOpen when recordResult returns ok", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -525,7 +525,7 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + mockRecordResult.mockResolvedValue({ ok: true, state: "ok" }); const { recordCircuitBreakerOpen } = jest.requireMock( "services/delivery-observability", @@ -536,6 +536,27 @@ describe("processRecords", () => { expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); + it("records CircuitBreakerClosed when recordResult returns closed", async () => { + const targetCb = { + ...DEFAULT_TARGET, + delivery: { circuitBreaker: { enabled: true } }, + }; + mockLoadTargetConfig.mockResolvedValue(targetCb); + mockDeliverPayload.mockResolvedValue({ + outcome: "success", + statusCode: 200, + }); + mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + + const { recordCircuitBreakerClosed } = jest.requireMock( + "services/delivery-observability", + ); + + await processRecords([makeRecord()]); + + expect(recordCircuitBreakerClosed).toHaveBeenCalledWith("target-1"); + }); + it("records RateLimited on 429 response", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", diff --git a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts index 5cc407fe..48495de1 100644 --- a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts @@ -3,7 +3,7 @@ import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; // ARGV: [now, consumedTokens, processingFailures, cooldownPeriodMs, recoveryPeriodMs, failureThreshold, minAttempts, samplePeriodMs] // KEYS: [epKey] -// Returns: [ok (0|1), state] state: "closed" | "opened" | "failed" +// Returns: [ok (0|1), state] state: "ok" | "closed" | "opened" 
| "failed" type RecordResultArgs = { now: number; @@ -54,7 +54,7 @@ function runRecordResult( describe("record-result.lua", () => { describe("success recording", () => { - it("returns closed state for a successful batch", () => { + it("returns ok state for a successful batch with no state change", () => { const store = createRedisStore(); store.set("ep:t1", new Map([["sample_till", "9999999999"]])); @@ -64,7 +64,7 @@ describe("record-result.lua", () => { }); expect(ok).toBe(1); - expect(state).toBe("closed"); + expect(state).toBe("ok"); }); it("increments cur_attempts without incrementing cur_failures", () => { diff --git a/lambdas/https-client-lambda/src/services/endpoint-gate.ts b/lambdas/https-client-lambda/src/services/endpoint-gate.ts index bf9c1462..8a3b9089 100644 --- a/lambdas/https-client-lambda/src/services/endpoint-gate.ts +++ b/lambdas/https-client-lambda/src/services/endpoint-gate.ts @@ -19,7 +19,7 @@ export type AdmitResultDenied = { export type AdmitResult = AdmitResultAllowed | AdmitResultDenied; export type RecordResultOutcome = - | { ok: true; state: "closed" } + | { ok: true; state: "closed" | "ok" } | { ok: false; state: "opened" | "failed" }; export type EndpointGateConfig = { @@ -159,7 +159,7 @@ export async function recordResult( const [ok, state] = raw; if (ok === 1) { - return { ok: true, state: "closed" }; + return { ok: true, state: state as "closed" | "ok" }; } return { ok: false, state: state as "opened" | "failed" }; diff --git a/lambdas/https-client-lambda/src/services/record-result.lua b/lambdas/https-client-lambda/src/services/record-result.lua index fa3b1b12..c8c6a0a6 100644 --- a/lambdas/https-client-lambda/src/services/record-result.lua +++ b/lambdas/https-client-lambda/src/services/record-result.lua @@ -14,6 +14,7 @@ local OPENED = "opened" local CLOSED = "closed" local FAILED = "failed" +local OK = "ok" -- Keys local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash @@ -147,4 +148,4 @@ if isOpen or processingFailures > 0 then return { 0, FAILED } end -return { 1, CLOSED } +return { 1, OK }