From 9856bc97a364c83bbea43f4e0317eda647f80029 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Thu, 23 Apr 2026 10:12:50 +0100 Subject: [PATCH 1/9] CCM-16073 - Updated rate limiting behaviour --- .../callbacks/module_perf_runner_lambda.tf | 52 +- .../src/__tests__/admit-lua.test.ts | 528 +++++++++--------- .../src/__tests__/endpoint-gate.test.ts | 132 +++-- .../src/__tests__/handler.test.ts | 178 ++++-- .../src/__tests__/record-result-lua.test.ts | 458 +++++++-------- lambdas/https-client-lambda/src/handler.ts | 350 ++++++------ .../src/services/admit.lua | 229 ++------ .../src/services/endpoint-gate.ts | 59 +- .../src/services/record-result.lua | 242 ++++---- lambdas/perf-runner-lambda/package.json | 6 +- .../src/__tests__/cloudwatch.test.ts | 377 ++++++++++++- .../src/__tests__/elasticache.test.ts | 74 +++ .../src/__tests__/index.test.ts | 48 ++ .../src/__tests__/purge.test.ts | 116 ++++ .../src/__tests__/runner.test.ts | 320 ++++++++++- .../src/__tests__/webhook-verify.test.ts | 173 ++++++ lambdas/perf-runner-lambda/src/cloudwatch.ts | 120 +++- lambdas/perf-runner-lambda/src/elasticache.ts | 52 ++ lambdas/perf-runner-lambda/src/index.ts | 23 +- lambdas/perf-runner-lambda/src/purge.ts | 40 ++ lambdas/perf-runner-lambda/src/runner.ts | 134 ++++- lambdas/perf-runner-lambda/src/types.ts | 40 ++ .../perf-runner-lambda/src/webhook-verify.ts | 59 ++ pnpm-lock.yaml | 18 +- 24 files changed, 2724 insertions(+), 1104 deletions(-) create mode 100644 lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts create mode 100644 lambdas/perf-runner-lambda/src/__tests__/purge.test.ts create mode 100644 lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts create mode 100644 lambdas/perf-runner-lambda/src/elasticache.ts create mode 100644 lambdas/perf-runner-lambda/src/purge.ts create mode 100644 lambdas/perf-runner-lambda/src/webhook-verify.ts diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf 
b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf index 424294a8..f3f57981 100644 --- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf +++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf @@ -40,6 +40,15 @@ module "perf_runner_lambda" { INBOUND_QUEUE_URL = module.sqs_inbound_event.sqs_queue_url TRANSFORM_FILTER_LOG_GROUP = module.client_transform_filter_lambda.cloudwatch_log_group_name DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${local.csi}-https-client-" + MOCK_WEBHOOK_LOG_GROUP = var.deploy_mock_clients ? module.mock_webhook_lambda[0].cloudwatch_log_group_name : "" + ELASTICACHE_ENDPOINT = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address + ELASTICACHE_CACHE_NAME = aws_elasticache_serverless_cache.delivery_state.name + ELASTICACHE_IAM_USERNAME = "${var.project}-${var.environment}-${var.component}-elasticache-user" + } + + vpc_config = { + subnet_ids = try(local.acct.private_subnets[local.bc_name], []) + security_group_ids = [aws_security_group.https_client_lambda.id] } } @@ -74,6 +83,22 @@ data "aws_iam_policy_document" "perf_runner_lambda" { ] } + statement { + sid = "SQSPurgeQueue" + effect = "Allow" + + actions = [ + "sqs:PurgeQueue", + ] + + resources = [ + module.sqs_inbound_event.sqs_queue_arn, + "${module.sqs_inbound_event.sqs_queue_arn}-dlq", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-dlq-queue", + ] + } + statement { sid = "CloudWatchLogsInsightsQuery" effect = "Allow" @@ -83,10 +108,15 @@ data "aws_iam_policy_document" "perf_runner_lambda" { "logs:StopQuery", ] - resources = [ - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*", - ] + resources = concat( + [ + 
"arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*", + ], + var.deploy_mock_clients ? [ + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.mock_webhook_lambda[0].cloudwatch_log_group_name}:*", + ] : [], + ) } statement { @@ -99,4 +129,18 @@ data "aws_iam_policy_document" "perf_runner_lambda" { resources = ["*"] } + + statement { + sid = "ElastiCacheConnect" + effect = "Allow" + + actions = [ + "elasticache:Connect", + ] + + resources = [ + aws_elasticache_serverless_cache.delivery_state.arn, + aws_elasticache_user.delivery_state_iam.arn, + ] + } } diff --git a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts index 6aab4727..f4906cf2 100644 --- a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts @@ -1,32 +1,32 @@ import admitLuaSrc from "services/admit.lua"; import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; -// ARGV: [now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs] -// KEYS: [cbKey, rlKey] -// Returns: [allowed (0|1), reason, retryAfterMs, effectiveRate] +// ARGV: [now, capacity, targetRateLimit, cooldownMs, recoveryPeriodMs, probeRateLimit, targetBatchSize] +// KEYS: [epKey] +// Returns: [consumedTokens, reason, retryAfterMs, effectiveRate] type AdmitArgs = { now: number; capacity: number; - refillPerSec: number; + targetRateLimit: number; cooldownMs: number; - decayPeriodMs: number; - cbWindowPeriodMs: number; - cbProbeIntervalMs: number; + recoveryPeriodMs: number; + probeRateLimit: number; + targetBatchSize: number; }; const defaultArgs: AdmitArgs = { now: 1_000_000, - capacity: 10, - refillPerSec: 10, - cooldownMs: 60_000, - decayPeriodMs: 
300_000, - cbWindowPeriodMs: 60_000, - cbProbeIntervalMs: 60_000, + capacity: 2250, + targetRateLimit: 10, + cooldownMs: 120_000, + recoveryPeriodMs: 600_000, + probeRateLimit: 1 / 60, + targetBatchSize: 1, }; type AdmitResult = { - allowed: number; + consumedTokens: number; reason: string; retryAfterMs: number; effectiveRate: number; @@ -40,20 +40,20 @@ function runAdmit( const merged = { ...defaultArgs, ...args }; const raw = evalLua( admitLuaSrc, - [`cb:${targetId}`, `rl:${targetId}`], + [`ep:${targetId}`], [ merged.now.toString(), merged.capacity.toString(), - merged.refillPerSec.toString(), + merged.targetRateLimit.toString(), merged.cooldownMs.toString(), - merged.decayPeriodMs.toString(), - merged.cbWindowPeriodMs.toString(), - merged.cbProbeIntervalMs.toString(), + merged.recoveryPeriodMs.toString(), + merged.probeRateLimit.toString(), + merged.targetBatchSize.toString(), ], store, ) as [number, string, number, number]; return { - allowed: raw[0], + consumedTokens: raw[0], reason: raw[1], retryAfterMs: raw[2], effectiveRate: raw[3], @@ -62,399 +62,391 @@ function runAdmit( describe("admit.lua", () => { describe("rate limiting", () => { - it("allows the first request with full token bucket", () => { - const store = createRedisStore(); - const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store); - - expect(allowed).toBe(1); - expect(reason).toBe("allowed"); - expect(retryAfterMs).toBe(0); - expect(effectiveRate).toBe(10); - }); - - it("depletes tokens on consecutive calls and rejects when empty", () => { + it("enters recovery ramp-up on a fresh endpoint with no prior state", () => { const store = createRedisStore(); + const now = 1_000_000; - for (let i = 0; i < 10; i++) { - const { allowed } = runAdmit(store); - expect(allowed).toBe(1); - } + const { consumedTokens, effectiveRate, reason } = runAdmit(store, { + now, + targetRateLimit: 10, + }); - const { allowed, reason } = runAdmit(store); - expect(allowed).toBe(0); + 
expect(consumedTokens).toBe(0); expect(reason).toBe("rate_limited"); + expect(effectiveRate).toBe(0); }); - it("returns retryAfterMs when rate limited", () => { + it("persists switched_at on first contact so recovery ramp progresses", () => { const store = createRedisStore(); + const now = 1_000_000; - for (let i = 0; i < 10; i++) { - runAdmit(store); - } + runAdmit(store, { now, targetRateLimit: 10 }); - const { retryAfterMs } = runAdmit(store); - expect(retryAfterMs).toBe(1000); + const epHash = store.get("ep:t1")!; + expect(epHash.get("switched_at")).toBe(now.toString()); }); - it("reports effective rate when rate limited", () => { + it("ramps up rate on subsequent calls after fresh endpoint initialisation", () => { const store = createRedisStore(); + const now = 1_000_000; + const later = now + 60_000; - for (let i = 0; i < 10; i++) { - runAdmit(store); - } + runAdmit(store, { now, targetRateLimit: 10 }); - const { effectiveRate } = runAdmit(store); - expect(effectiveRate).toBe(10); + const { consumedTokens, reason } = runAdmit(store, { + now: later, + targetRateLimit: 10, + }); + + expect(consumedTokens).toBeGreaterThanOrEqual(1); + expect(reason).toBe("allowed"); }); - it("refills tokens over time", () => { + it("allows a single request when bucket has tokens from refill", () => { const store = createRedisStore(); const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ["switched_at", "0"], + ]), + ); - for (let i = 0; i < 10; i++) { - runAdmit(store, { now }); - } - - const denied = runAdmit(store, { now }); - expect(denied.allowed).toBe(0); + const { consumedTokens, reason, retryAfterMs } = runAdmit(store, { + now, + targetRateLimit: 10, + }); - const refilled = runAdmit(store, { now: now + 1000 }); - expect(refilled.allowed).toBe(1); + expect(consumedTokens).toBe(1); + expect(reason).toBe("allowed"); + expect(retryAfterMs).toBe(0); }); - it("caps tokens at capacity", () => { + it("consumes up to 
targetBatchSize tokens", () => { const store = createRedisStore(); const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "5"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - runAdmit(store, { now, capacity: 5, refillPerSec: 100 }); - - // Advance 10 seconds — would add 1000 tokens without cap - runAdmit(store, { now: now + 10_000, capacity: 5, refillPerSec: 100 }); - - const rlHash = store.get("rl:t1")!; - // Refill capped to capacity (5), then one consumed → 4 - expect(Number(rlHash.get("tokens"))).toBe(4); + const { consumedTokens } = runAdmit(store, { + now, + targetBatchSize: 3, + }); + expect(consumedTokens).toBe(3); }); - it("handles zero refill rate", () => { + it("consumes all available when batch exceeds available tokens", () => { const store = createRedisStore(); + const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "2"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - for (let i = 0; i < 10; i++) { - runAdmit(store, { refillPerSec: 0 }); - } - - const { allowed, reason, retryAfterMs } = runAdmit(store, { - refillPerSec: 0, + const { consumedTokens } = runAdmit(store, { + now, + targetBatchSize: 5, }); - expect(allowed).toBe(0); - expect(reason).toBe("rate_limited"); - expect(retryAfterMs).toBe(1000); + expect(consumedTokens).toBe(2); }); - }); - describe("circuit breaker", () => { - it("rejects when circuit is open", () => { + it("returns rate_limited when no tokens available", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 60_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", now.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - const { allowed, effectiveRate, reason } = runAdmit(store, { now }); - expect(allowed).toBe(0); - expect(reason).toBe("circuit_open"); - 
expect(effectiveRate).toBe(0); + const { consumedTokens, reason, retryAfterMs } = runAdmit(store, { now }); + expect(consumedTokens).toBe(0); + expect(reason).toBe("rate_limited"); + expect(retryAfterMs).toBe(1000); }); - it("returns retryAfterMs for open circuit", () => { + it("refills tokens over time", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 30_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", now.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - const { retryAfterMs } = runAdmit(store, { now }); - expect(retryAfterMs).toBe(30_000); + const { consumedTokens } = runAdmit(store, { + now: now + 1000, + targetRateLimit: 10, + }); + expect(consumedTokens).toBe(1); }); - it("allows probe when probe interval has elapsed", () => { + it("caps tokens at capacity", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 61_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ["switched_at", "0"], ]), ); - const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store, { + const { consumedTokens } = runAdmit(store, { now, - cbProbeIntervalMs: 60_000, + capacity: 5, + targetRateLimit: 100, + targetBatchSize: 10, }); - expect(allowed).toBe(1); - expect(reason).toBe("probe"); - expect(retryAfterMs).toBe(0); - expect(effectiveRate).toBe(0); + expect(consumedTokens).toBe(5); }); - it("updates last_probe_ms after allowing a probe", () => { + it("handles zero refill rate", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 61_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", 
openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - runAdmit(store, { now, cbProbeIntervalMs: 60_000 }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("last_probe_ms")).toBe(now.toString()); + const { consumedTokens, reason } = runAdmit(store, { + now: now + 10_000, + targetRateLimit: 0, + }); + expect(consumedTokens).toBe(0); + expect(reason).toBe("rate_limited"); }); - it("does not probe when interval has not elapsed", () => { + it("preserves fractional refill time (bucketRefilledAt += generationTime, not now)", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 30_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", (now - 150).toString()], + ["switched_at", "0"], ]), ); - const { allowed, reason } = runAdmit(store, { - now, - cbProbeIntervalMs: 60_000, - }); - expect(allowed).toBe(0); - expect(reason).toBe("circuit_open"); + runAdmit(store, { now, targetRateLimit: 10 }); + + const epHash = store.get("ep:t1")!; + const refilledAt = Number(epHash.get("bucket_refilled_at")); + // 1 token generated at rate 10/s takes 100ms, so refilledAt = (now-150) + 100 = now - 50 + expect(refilledAt).toBe(now - 50); }); + }); - it("does not probe when cbProbeIntervalMs is 0", () => { + describe("circuit breaker states", () => { + it("blocks completely when circuit is open during cooldown", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; + const switchedAt = now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", "0"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], ]), ); - 
const { allowed, reason } = runAdmit(store, { + const { consumedTokens, reason } = runAdmit(store, { now, - cbProbeIntervalMs: 0, + cooldownMs: 120_000, }); - expect(allowed).toBe(0); + expect(consumedTokens).toBe(0); expect(reason).toBe("circuit_open"); }); - }); - describe("sliding window", () => { - it("initialises cbWindowFrom on first call", () => { + it("does not consume bucket tokens when fully open", () => { const store = createRedisStore(); const now = 1_000_000; - - runAdmit(store, { now }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_window_from")).toBe(now.toString()); - }); - - it("rolls current window to previous when period expires", () => { - const store = createRedisStore(); - const cbWindowPeriodMs = 60_000; - const t0 = 1_000_000; - const t1 = t0 + cbWindowPeriodMs + 1; + const switchedAt = now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", t0.toString()], - ["cb_failures", "5"], - ["cb_attempts", "10"], - ["cb_prev_failures", "0"], - ["cb_prev_attempts", "0"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], + ["bucket_refilled_at", now.toString()], ]), ); - runAdmit(store, { now: t1, cbWindowPeriodMs }); + runAdmit(store, { now, cooldownMs: 120_000 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_prev_failures")).toBe("5"); - expect(cbHash.get("cb_prev_attempts")).toBe("10"); - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe(t1.toString()); + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(100); }); - it("clears both windows when gap exceeds two periods", () => { + it("returns retryAfterMs for open circuit", () => { const store = createRedisStore(); - const cbWindowPeriodMs = 60_000; - const t0 = 1_000_000; - const t1 = t0 + 2 * cbWindowPeriodMs + 1; + const now = 1_000_000; + const switchedAt = now - 10_000; 
store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", t0.toString()], - ["cb_failures", "5"], - ["cb_attempts", "10"], - ["cb_prev_failures", "3"], - ["cb_prev_attempts", "7"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], ]), ); - runAdmit(store, { now: t1, cbWindowPeriodMs }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_prev_failures")).toBe("0"); - expect(cbHash.get("cb_prev_attempts")).toBe("0"); - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe(t1.toString()); + const { retryAfterMs } = runAdmit(store, { now, cooldownMs: 120_000 }); + expect(retryAfterMs).toBe(110_000); }); - }); - describe("decay scaling", () => { - it("applies reduced rate during decay period", () => { + it("uses probeRateLimit when half-open (after cooldown)", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const halfwayThrough = closedAt + decayPeriodMs / 2; + const now = 1_000_000; + const switchedAt = now - 130_000; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", (now - 60_000).toString()], + ]), + ); const { effectiveRate } = runAdmit(store, { - now: halfwayThrough, - refillPerSec: 10, - decayPeriodMs, - }); - expect(effectiveRate).toBe(5); - }); - - it("uses full rate after decay period ends", () => { - const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const afterDecay = closedAt + decayPeriodMs + 1; - - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); - - const { allowed, effectiveRate } = runAdmit(store, { - now: afterDecay, - refillPerSec: 10, - decayPeriodMs, + now, + cooldownMs: 120_000, + probeRateLimit: 1 / 60, }); - expect(allowed).toBe(1); - 
expect(effectiveRate).toBe(10); + expect(effectiveRate).toBeCloseTo(1 / 60, 5); }); - it("clamps minimum effective rate to 1", () => { + it("uses recovery ramp when closed during recovery period", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const veryEarly = closedAt + 1; + const switchedAt = 1_000_000; + const recoveryPeriodMs = 600_000; + const now = switchedAt + recoveryPeriodMs / 2; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ]), + ); const { effectiveRate } = runAdmit(store, { - now: veryEarly, - refillPerSec: 10, - decayPeriodMs, + now, + targetRateLimit: 10, + recoveryPeriodMs, }); - expect(effectiveRate).toBeGreaterThanOrEqual(1); - }); - - it("clears openedUntil when decay period fully elapses", () => { - const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const afterDecay = closedAt + decayPeriodMs + 1; - - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); - - runAdmit(store, { now: afterDecay, decayPeriodMs }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + expect(effectiveRate).toBe(5); }); - it("does not decay when decayPeriodMs is 0", () => { + it("uses full rate when closed and past recovery period", () => { const store = createRedisStore(); - const closedAt = 1_000_000; + const switchedAt = 100_000; + const recoveryPeriodMs = 600_000; + const now = switchedAt + recoveryPeriodMs + 1; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ]), + ); - const { allowed, effectiveRate } = runAdmit(store, { - now: closedAt 
+ 1, - refillPerSec: 10, - decayPeriodMs: 0, + const { effectiveRate } = runAdmit(store, { + now, + targetRateLimit: 10, + recoveryPeriodMs, }); - expect(allowed).toBe(1); expect(effectiveRate).toBe(10); }); }); describe("state persistence", () => { - it("persists token count and last_refill_ms", () => { + it("persists bucket_tokens and bucket_refilled_at", () => { const store = createRedisStore(); - runAdmit(store, { now: 1_000_000, capacity: 5 }); + const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["bucket_tokens", "5"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - const rlHash = store.get("rl:t1")!; - expect(rlHash.get("tokens")).toBeDefined(); - expect(rlHash.get("last_refill_ms")).toBe("1000000"); + runAdmit(store, { now, targetBatchSize: 2 }); + + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(3); }); - it("persists circuit breaker fields", () => { + it("does not write sampling or circuit fields", () => { const store = createRedisStore(); - runAdmit(store, { now: 1_000_000 }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.has("opened_until_ms")).toBe(true); - expect(cbHash.has("cb_window_from")).toBe(true); - expect(cbHash.has("cb_failures")).toBe(true); - expect(cbHash.has("cb_attempts")).toBe(true); - expect(cbHash.has("cb_prev_failures")).toBe(true); - expect(cbHash.has("cb_prev_attempts")).toBe(true); + runAdmit(store, { + now: 10_000, + }); + + const epHash = store.get("ep:t1")!; + expect(epHash.has("cur_attempts")).toBe(false); + expect(epHash.has("cur_failures")).toBe(false); + expect(epHash.has("sample_till")).toBe(false); }); it("isolates state between targets", () => { const store = createRedisStore(); - runAdmit(store, {}, "target-a"); - runAdmit(store, {}, "target-b"); + store.set( + "ep:target-a", + new Map([ + ["bucket_tokens", "5"], + ["bucket_refilled_at", "10000"], + ]), + ); + store.set( + "ep:target-b", + new Map([ + ["bucket_tokens", "3"], + 
["bucket_refilled_at", "10000"], + ]), + ); + + runAdmit(store, { now: 10_000 }, "target-a"); + runAdmit(store, { now: 10_000 }, "target-b"); - expect(store.has("cb:target-a")).toBe(true); - expect(store.has("cb:target-b")).toBe(true); - expect(store.has("rl:target-a")).toBe(true); - expect(store.has("rl:target-b")).toBe(true); + expect(store.has("ep:target-a")).toBe(true); + expect(store.has("ep:target-b")).toBe(true); }); }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts index efbc6d88..2cc8cc31 100644 --- a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts @@ -11,13 +11,13 @@ const mockDisconnect = jest.fn().mockResolvedValue(undefined); const mockOn = jest.fn(); const defaultConfig: EndpointGateConfig = { - burstCapacity: 10, - cbProbeIntervalMs: 60_000, - decayPeriodMs: 300_000, - cbWindowPeriodMs: 60_000, - cbErrorThreshold: 0.5, - cbMinAttempts: 10, - cbCooldownMs: 60_000, + burstCapacity: 2250, + probeRateLimit: 1 / 60, + recoveryPeriodMs: 600_000, + samplePeriodMs: 300_000, + failureThreshold: 0.3, + minAttempts: 5, + cooldownPeriodMs: 120_000, }; const mockRedis = { @@ -34,12 +34,23 @@ beforeEach(() => { }); describe("admit", () => { - it("returns allowed when tokens available", async () => { - mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]); + it("returns allowed with consumedTokens when tokens available", async () => { + mockSendCommand.mockResolvedValueOnce([5, "allowed", 0, 10]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 5, + defaultConfig, + ); - expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 }); + expect(result).toEqual({ + allowed: true, + consumedTokens: 5, + effectiveRate: 10, + }); 
expect(mockSendCommand).toHaveBeenCalledWith( expect.arrayContaining(["EVALSHA"]), ); @@ -48,7 +59,14 @@ describe("admit", () => { it("returns rate_limited when tokens exhausted", async () => { mockSendCommand.mockResolvedValueOnce([0, "rate_limited", 1000, 10]); - const result = await admit(mockRedis, "target-1", 10, false, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + false, + 5, + defaultConfig, + ); expect(result).toEqual({ allowed: false, @@ -58,18 +76,17 @@ describe("admit", () => { }); }); - it("returns allowed with probe flag when circuit is open but probe slot is available", async () => { - mockSendCommand.mockResolvedValueOnce([1, "probe", 0, 0]); - - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); - - expect(result).toEqual({ allowed: true, probe: true, effectiveRate: 0 }); - }); - - it("returns circuit_open without probe slot", async () => { + it("returns circuit_open when circuit is fully open", async () => { mockSendCommand.mockResolvedValueOnce([0, "circuit_open", 30_000, 0]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 5, + defaultConfig, + ); expect(result).toEqual({ allowed: false, @@ -84,9 +101,20 @@ describe("admit", () => { .mockRejectedValueOnce(new Error("NOSCRIPT No matching script")) .mockResolvedValueOnce([1, "allowed", 0, 10]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 1, + defaultConfig, + ); - expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 }); + expect(result).toEqual({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); expect(mockSendCommand).toHaveBeenCalledTimes(2); expect(mockSendCommand).toHaveBeenNthCalledWith( 1, @@ -98,25 +126,33 @@ describe("admit", () => { ); }); - it("passes cbProbeIntervalMs=0 when circuit 
breaker is disabled", async () => { + it("passes probeRateLimit=0 when circuit breaker is disabled", async () => { mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]); - await admit(mockRedis, "target-1", 10, false, defaultConfig); + await admit(mockRedis, "target-1", 10, false, 1, defaultConfig); - // EVALSHA layout: [EVALSHA, sha, keyCount, cbKey, rlKey, now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs] const args = mockSendCommand.mock.calls[0]![0] as string[]; - const cbProbeIntervalArg = args[11]; - expect(cbProbeIntervalArg).toBe("0"); + const probeRateArg = args[9]; + expect(probeRateArg).toBe("0"); }); - it("passes cbKey first, rlKey second", async () => { + it("passes single epKey", async () => { mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 5]); - await admit(mockRedis, "my-target", 5, true, defaultConfig); + await admit(mockRedis, "my-target", 5, true, 1, defaultConfig); const args = mockSendCommand.mock.calls[0]![0] as string[]; - expect(args[3]).toBe("cb:{my-target}"); - expect(args[4]).toBe("rl:{my-target}"); + expect(args[3]).toBe("ep:{my-target}"); + }); + + it("passes targetBatchSize as ARGV", async () => { + mockSendCommand.mockResolvedValueOnce([3, "allowed", 0, 10]); + + await admit(mockRedis, "target-1", 10, true, 7, defaultConfig); + + const args = mockSendCommand.mock.calls[0]![0] as string[]; + const batchSizeArg = args[10]; + expect(batchSizeArg).toBe("7"); }); }); @@ -130,6 +166,7 @@ describe("evalScript", () => { "target-1", 10, true, + 1, defaultConfig, ).catch((error: unknown) => error); @@ -149,6 +186,7 @@ describe("evalScript", () => { "target-1", 10, true, + 1, defaultConfig, ).catch((error: unknown) => error); @@ -165,7 +203,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - true, + 5, + 0, defaultConfig, ); @@ -181,7 +220,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - false, 
+ 5, + 5, defaultConfig, ); @@ -194,7 +234,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - false, + 5, + 1, defaultConfig, ); @@ -209,7 +250,8 @@ describe("recordResult", () => { const result = await recordResult( mockRedis, "target-1", - true, + 1, + 0, defaultConfig, ); @@ -217,12 +259,22 @@ describe("recordResult", () => { expect(mockSendCommand).toHaveBeenCalledTimes(2); }); - it("passes correct cb key for target", async () => { + it("passes correct ep key for target", async () => { + mockSendCommand.mockResolvedValueOnce([1, "closed"]); + + await recordResult(mockRedis, "my-target", 1, 0, defaultConfig); + + const args = mockSendCommand.mock.calls[0]![0] as string[]; + expect(args[3]).toBe("ep:{my-target}"); + }); + + it("passes consumedTokens and processingFailures as ARGV", async () => { mockSendCommand.mockResolvedValueOnce([1, "closed"]); - await recordResult(mockRedis, "my-target", true, defaultConfig); + await recordResult(mockRedis, "target-1", 8, 3, defaultConfig); const args = mockSendCommand.mock.calls[0]![0] as string[]; - expect(args[3]).toBe("cb:{my-target}"); + expect(args[5]).toBe("8"); + expect(args[6]).toBe("3"); }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts index 3b8ad521..a2b7e8b4 100644 --- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts @@ -3,7 +3,6 @@ import { DEFAULT_TARGET, makeRecord, } from "__tests__/fixtures/handler-fixtures"; -import { VisibilityManagedError } from "services/visibility-managed-error"; jest.mock("@nhs-notify-client-callbacks/logger", () => ({ logger: { @@ -74,17 +73,20 @@ jest.mock("services/redis-client", () => ({ getRedisClient: (...args: unknown[]) => mockGetRedisClient(...args), })); +jest.mock("services/delivery-observability", () => ({ + recordAdmissionDenied: jest.fn(), + 
recordCircuitBreakerClosed: jest.fn(), + recordCircuitBreakerOpen: jest.fn(), + recordDeliveryAttempt: jest.fn(), + recordDeliveryDuration: jest.fn(), + recordDeliveryFailure: jest.fn(), + recordDeliveryPermanentFailure: jest.fn(), + recordDeliveryRateLimited: jest.fn(), + recordDeliverySuccess: jest.fn(), + recordRetryWindowExhausted: jest.fn(), +})); + jest.mock("services/delivery-metrics", () => ({ - emitAdmissionDenied: jest.fn(), - emitCircuitBreakerClosed: jest.fn(), - emitCircuitBreakerOpen: jest.fn(), - emitDeliveryAttempt: jest.fn(), - emitDeliveryDuration: jest.fn(), - emitDeliveryFailure: jest.fn(), - emitDeliveryPermanentFailure: jest.fn(), - emitDeliverySuccess: jest.fn(), - emitRateLimited: jest.fn(), - emitRetryWindowExhausted: jest.fn(), flushMetrics: jest.fn().mockResolvedValue(undefined), resetMetrics: jest.fn(), })); @@ -106,12 +108,12 @@ describe("processRecords", () => { mockJitteredBackoff.mockReturnValue(5); mockIsWindowExhausted.mockReturnValue(false); mockHandleRateLimitedRecord.mockRejectedValue( - new VisibilityManagedError("Rate limited — requeue"), + new Error("Rate limited — requeue"), ); mockGetRedisClient.mockResolvedValue({}); mockAdmit.mockResolvedValue({ allowed: true, - probe: false, + consumedTokens: 100, effectiveRate: 10, }); mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); @@ -159,7 +161,7 @@ describe("processRecords", () => { expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); }); - it("returns failure for 429 rate-limited responses", async () => { + it("returns failure for 429 when handleRateLimitedRecord rejects", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", retryAfterHeader: "60", @@ -177,7 +179,7 @@ describe("processRecords", () => { ); }); - it("processes multiple records independently", async () => { + it("processes multiple records in a single target batch", async () => { const record1 = makeRecord({ messageId: "msg-1" }); const record2 = makeRecord({ messageId: 
"msg-2" }); @@ -191,25 +193,45 @@ describe("processRecords", () => { const failures = await processRecords([record1, record2]); expect(failures).toEqual([{ itemIdentifier: "msg-2" }]); + expect(mockAdmit).toHaveBeenCalledTimes(1); + }); + + it("delivers only admitted records when consumedTokens is less than batch size", async () => { + const record1 = makeRecord({ messageId: "msg-1" }); + const record2 = makeRecord({ messageId: "msg-2" }); + const record3 = makeRecord({ messageId: "msg-3" }); + + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); + + const failures = await processRecords([record1, record2, record3]); + + expect(mockDeliverPayload).toHaveBeenCalledTimes(1); + expect(failures).toEqual([ + { itemIdentifier: "msg-2" }, + { itemIdentifier: "msg-3" }, + ]); }); - it("an unexpected error on one record does not prevent subsequent records being processed", async () => { + it("an unexpected delivery error does not prevent other records in the batch", async () => { const record1 = makeRecord({ messageId: "msg-1" }); const record2 = makeRecord({ messageId: "msg-2" }); - mockLoadTargetConfig - .mockRejectedValueOnce(new Error("S3 unavailable")) - .mockResolvedValueOnce(DEFAULT_TARGET); + mockDeliverPayload + .mockRejectedValueOnce(new Error("Connection reset")) + .mockResolvedValueOnce({ outcome: "success" }); const failures = await processRecords([record1, record2]); expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); - expect(mockDeliverPayload).toHaveBeenCalledTimes(1); expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5); }); it("applies jittered backoff cooldown on unexpected errors", async () => { - mockLoadTargetConfig.mockRejectedValue(new Error("Infrastructure error")); + mockDeliverPayload.mockRejectedValue(new Error("Infrastructure error")); const failures = await processRecords([makeRecord()]); @@ -217,7 +239,7 @@ describe("processRecords", () => { 
expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5); }); - it("does not apply a second visibility change for admission-denied (managed path)", async () => { + it("changes visibility once per record for admission-denied batch", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -230,7 +252,7 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledTimes(1); }); - it("does not apply a second visibility change for transient failure (managed path)", async () => { + it("changes visibility once for transient failure", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -241,13 +263,13 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledTimes(1); }); - it("returns failure when CLIENT_ID is not set", async () => { + it("throws when CLIENT_ID is not set", async () => { const saved = process.env.CLIENT_ID; delete process.env.CLIENT_ID; - const failures = await processRecords([makeRecord()]); - - expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); + await expect(processRecords([makeRecord()])).rejects.toThrow( + "CLIENT_ID is required", + ); process.env.CLIENT_ID = saved; }); @@ -262,7 +284,7 @@ describe("processRecords", () => { expect(mockDeliverPayload).not.toHaveBeenCalled(); }); - it("calls changeVisibility with backoff on 5xx then throws", async () => { + it("calls changeVisibility with backoff on 5xx", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -303,7 +325,7 @@ describe("processRecords", () => { expect(failures).toEqual([]); }); - it("requeues when rate limited by endpoint gate", async () => { + it("requeues all records when rate limited by endpoint gate", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -319,7 +341,7 @@ describe("processRecords", () => { expect(mockDeliverPayload).not.toHaveBeenCalled(); }); - 
it("requeues when circuit is open", async () => { + it("requeues all records when circuit is open", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "circuit_open", @@ -350,17 +372,23 @@ describe("processRecords", () => { "target-1", 10, false, + 1, expect.any(Object), ); expect(mockDeliverPayload).toHaveBeenCalled(); }); - it("calls recordResult(true) on successful delivery when CB enabled", async () => { + it("calls recordResult with batch counts on successful delivery when CB enabled", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, }; mockLoadTargetConfig.mockResolvedValue(targetCb); + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); const failures = await processRecords([makeRecord()]); @@ -368,17 +396,23 @@ describe("processRecords", () => { expect(mockRecordResult).toHaveBeenCalledWith( expect.anything(), "target-1", - true, + 1, + 0, expect.any(Object), ); }); - it("calls recordResult(false) on 5xx before visibility change", async () => { + it("calls recordResult with failure count on 5xx when CB enabled", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, }; mockLoadTargetConfig.mockResolvedValue(targetCb); + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -390,13 +424,14 @@ describe("processRecords", () => { expect(mockRecordResult).toHaveBeenCalledWith( expect.anything(), "target-1", - false, + 1, + 1, expect.any(Object), ); expect(mockChangeVisibility).toHaveBeenCalled(); }); - it("does not call recordResult on rate-limited path", async () => { + it("does not call recordResult on gate admission-denied path", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -409,17 +444,6 @@ describe("processRecords", () => { 
expect(mockRecordResult).not.toHaveBeenCalled(); }); - it("does not call recordResult on 429 path", async () => { - mockDeliverPayload.mockResolvedValue({ - outcome: "rate_limited", - retryAfterHeader: "60", - }); - - await processRecords([makeRecord()]); - - expect(mockRecordResult).not.toHaveBeenCalled(); - }); - it("does not call recordResult when CB is disabled on transient failure", async () => { const targetNoCb = { ...DEFAULT_TARGET, @@ -449,7 +473,7 @@ describe("processRecords", () => { expect(mockRecordResult).not.toHaveBeenCalled(); }); - it("emits CircuitBreakerOpen metric when recordResult returns opened", async () => { + it("records CircuitBreakerOpen when recordResult returns opened", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -461,16 +485,16 @@ describe("processRecords", () => { }); mockRecordResult.mockResolvedValue({ ok: false, state: "opened" }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).toHaveBeenCalledWith("target-1"); + expect(recordCircuitBreakerOpen).toHaveBeenCalledWith("target-1"); }); - it("does not emit CircuitBreakerOpen when recordResult returns failed", async () => { + it("does not record CircuitBreakerOpen when recordResult returns failed", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -482,16 +506,16 @@ describe("processRecords", () => { }); mockRecordResult.mockResolvedValue({ ok: false, state: "failed" }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).not.toHaveBeenCalled(); + 
expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("does not emit CircuitBreakerOpen when recordResult returns closed", async () => { + it("does not record CircuitBreakerOpen when recordResult returns closed", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -503,26 +527,32 @@ describe("processRecords", () => { }); mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).not.toHaveBeenCalled(); + expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("emits RateLimited metric on 429 response", async () => { + it("records RateLimited on 429 response", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", retryAfterHeader: "60", }); - const { emitRateLimited } = jest.requireMock("services/delivery-metrics"); + const { recordDeliveryRateLimited } = jest.requireMock( + "services/delivery-observability", + ); await processRecords([makeRecord()]); - expect(emitRateLimited).toHaveBeenCalledWith("target-1"); + expect(recordDeliveryRateLimited).toHaveBeenCalledWith( + "client-1", + "target-1", + "test-message-id", + ); }); it("uses configured maxRetryDurationSeconds when set on target", async () => { @@ -558,4 +588,30 @@ describe("processRecords", () => { 7_200_000, ); }); + + it("groups records by target and processes each batch separately", async () => { + const record1 = makeRecord({ messageId: "msg-1" }); + const record2 = makeRecord({ + messageId: "msg-2", + body: JSON.stringify({ + payload: { + data: [ + { + type: "MessageStatus", + attributes: { messageStatus: "delivered" }, + }, + ], + }, + subscriptionId: "sub-2", + targetId: "target-2", + }), + }); + + const failures = await processRecords([record1, 
record2]); + + expect(failures).toEqual([]); + expect(mockAdmit).toHaveBeenCalledTimes(2); + expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-1"); + expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-2"); + }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts index 515f1377..5cc407fe 100644 --- a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts @@ -1,28 +1,30 @@ import recordResultLuaSrc from "services/record-result.lua"; import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; -// ARGV: [now, success, cooldownMs, decayPeriodMs, cbErrorThreshold, cbMinAttempts, cbWindowPeriodMs] -// KEYS: [cbKey] +// ARGV: [now, consumedTokens, processingFailures, cooldownPeriodMs, recoveryPeriodMs, failureThreshold, minAttempts, samplePeriodMs] +// KEYS: [epKey] // Returns: [ok (0|1), state] state: "closed" | "opened" | "failed" type RecordResultArgs = { now: number; - success: boolean; - cooldownMs: number; - decayPeriodMs: number; - cbErrorThreshold: number; - cbMinAttempts: number; - cbWindowPeriodMs: number; + consumedTokens: number; + processingFailures: number; + cooldownPeriodMs: number; + recoveryPeriodMs: number; + failureThreshold: number; + minAttempts: number; + samplePeriodMs: number; }; const defaultArgs: RecordResultArgs = { now: 1_000_000, - success: true, - cooldownMs: 60_000, - decayPeriodMs: 300_000, - cbErrorThreshold: 0.5, - cbMinAttempts: 10, - cbWindowPeriodMs: 60_000, + consumedTokens: 1, + processingFailures: 0, + cooldownPeriodMs: 120_000, + recoveryPeriodMs: 600_000, + failureThreshold: 0.3, + minAttempts: 5, + samplePeriodMs: 300_000, }; type RecordResultResult = [number, string]; @@ -35,15 +37,16 @@ function runRecordResult( const merged = { ...defaultArgs, ...args }; return evalLua( 
recordResultLuaSrc, - [`cb:${targetId}`], + [`ep:${targetId}`], [ merged.now.toString(), - merged.success ? "1" : "0", - merged.cooldownMs.toString(), - merged.decayPeriodMs.toString(), - merged.cbErrorThreshold.toString(), - merged.cbMinAttempts.toString(), - merged.cbWindowPeriodMs.toString(), + merged.consumedTokens.toString(), + merged.processingFailures.toString(), + merged.cooldownPeriodMs.toString(), + merged.recoveryPeriodMs.toString(), + merged.failureThreshold.toString(), + merged.minAttempts.toString(), + merged.samplePeriodMs.toString(), ], store, ) as RecordResultResult; @@ -51,79 +54,122 @@ function runRecordResult( describe("record-result.lua", () => { describe("success recording", () => { - it("returns closed state for a successful result", () => { + it("returns closed state for a successful batch", () => { const store = createRedisStore(); - const [ok, state] = runRecordResult(store, { success: true }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + const [ok, state] = runRecordResult(store, { + consumedTokens: 5, + processingFailures: 0, + }); expect(ok).toBe(1); expect(state).toBe("closed"); }); - it("increments attempt count without incrementing failures", () => { + it("increments cur_attempts without incrementing cur_failures", () => { const store = createRedisStore(); - runRecordResult(store, { success: true }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_attempts")).toBe("1"); - expect(cbHash.get("cb_failures")).toBe("0"); + runRecordResult(store, { consumedTokens: 3, processingFailures: 0 }); + + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("3"); + expect(epHash.get("cur_failures")).toBe("0"); }); }); describe("failure recording", () => { - it("increments both attempts and failures on error", () => { + it("increments both cur_attempts and cur_failures", () => { const store = createRedisStore(); - 
runRecordResult(store, { success: false }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + runRecordResult(store, { consumedTokens: 5, processingFailures: 1 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_attempts")).toBe("1"); - expect(cbHash.get("cb_failures")).toBe("1"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("5"); + expect(epHash.get("cur_failures")).toBe("1"); }); - it("returns failed state for a single failure below threshold", () => { + it("returns failed state for failures below threshold", () => { const store = createRedisStore(); - const [ok, state] = runRecordResult(store, { success: false }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + const [ok, state] = runRecordResult(store, { + consumedTokens: 1, + processingFailures: 1, + }); expect(ok).toBe(0); expect(state).toBe("failed"); }); + }); - it("stays closed when below error threshold", () => { + describe("recording guard — fully open", () => { + it("does not record attempts/failures when circuit is fully open", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 10_000; + + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], + ["cur_attempts", "0"], + ["cur_failures", "0"], + ]), + ); - for (let i = 0; i < 8; i++) { - runRecordResult(store, { now, success: true }); - } - for (let i = 0; i < 2; i++) { - runRecordResult(store, { now, success: false }); - } + runRecordResult(store, { + now, + cooldownPeriodMs: 120_000, + consumedTokens: 5, + processingFailures: 3, + }); - const [ok, state] = runRecordResult(store, { now, success: true }); - expect(ok).toBe(1); - expect(state).toBe("closed"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("0"); + expect(epHash.get("cur_failures")).toBe("0"); }); - }); - describe("circuit opening", () => { - it("opens 
circuit when error rate exceeds threshold", () => { + it("returns failed when circuit is fully open and state unchanged", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 10_000; - for (let i = 0; i < 4; i++) { - const [, state] = runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - expect(state).toBe("failed"); - } + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], + ]), + ); const [ok, state] = runRecordResult(store, { now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 0, + }); + + expect(ok).toBe(0); + expect(state).toBe("failed"); + }); + }); + + describe("circuit opening", () => { + it("opens circuit when failure rate exceeds threshold", () => { + const store = createRedisStore(); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + + const [ok, state] = runRecordResult(store, { + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, }); expect(ok).toBe(0); expect(state).toBe("opened"); @@ -131,243 +177,213 @@ describe("record-result.lua", () => { it("does not open circuit when below minimum attempts", () => { const store = createRedisStore(); - const now = 1_000_000; - - for (let i = 0; i < 4; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 10, - }); - } + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); const [ok, state] = runRecordResult(store, { - now, - success: false, - cbMinAttempts: 10, + consumedTokens: 3, + processingFailures: 3, + minAttempts: 5, + failureThreshold: 0.3, }); expect(ok).toBe(0); expect(state).toBe("failed"); }); - it("sets opened_until_ms with cooldown on open", () => { + it("sets is_open and switched_at on open", () => { const store = createRedisStore(); const now = 1_000_000; - const 
cooldownMs = 30_000; - - for (let i = 0; i < 5; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - cooldownMs, - }); - } - - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(now + cooldownMs); - }); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); - it("resets all counters on open", () => { - const store = createRedisStore(); - const now = 1_000_000; + runRecordResult(store, { + now, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + }); - for (let i = 0; i < 5; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - } - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe("0"); - expect(cbHash.get("cb_prev_failures")).toBe("0"); - expect(cbHash.get("cb_prev_attempts")).toBe("0"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("is_open")).toBe("1"); + expect(Number(epHash.get("switched_at"))).toBe(now); }); - it("does not double-trip when circuit is already open", () => { + it("resets all counters and sets sampleTill on open", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 60_000; + const samplePeriodMs = 300_000; + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); - store.set( - "cb:t1", - new Map([ - ["opened_until_ms", openedUntil.toString()], - ["cb_window_from", now.toString()], - ]), - ); + runRecordResult(store, { + now, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + samplePeriodMs, + }); - for (let i = 0; i < 20; i++) { - const [, state] = runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - expect(state).toBe("failed"); - } - - const cbHash = store.get("cb:t1")!; - 
expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_failures")).toBe("0"); + expect(epHash.get("cur_attempts")).toBe("0"); + expect(epHash.get("prev_failures")).toBe("0"); + expect(epHash.get("prev_attempts")).toBe("0"); + expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs); }); }); - describe("two-window blended rate", () => { - it("blends previous window failures into current assessment", () => { + describe("circuit closing — half-open with successes", () => { + it("closes circuit when half-open and batch has successes", () => { const store = createRedisStore(); const now = 1_000_000; - const cbWindowPeriodMs = 60_000; + const switchedAt = now - 130_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", now.toString()], - ["cb_prev_failures", "8"], - ["cb_prev_attempts", "10"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], ]), ); const [ok, state] = runRecordResult(store, { now, - success: false, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 0, }); - expect(ok).toBe(0); - expect(state).toBe("opened"); - }); - it("reduces previous window weight as current window ages", () => { - const store = createRedisStore(); - const cbWindowPeriodMs = 100_000; - const t0 = 1_000_000; - const nearEnd = t0 + cbWindowPeriodMs - 1; - - store.set( - "cb:t1", - new Map([ - ["cb_window_from", t0.toString()], - ["cb_prev_failures", "10"], - ["cb_prev_attempts", "10"], - ]), - ); + expect(ok).toBe(1); + expect(state).toBe("closed"); - for (let i = 0; i < 20; i++) { - runRecordResult(store, { - now: nearEnd, - success: true, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - } - - const [, state] = runRecordResult(store, { - now: nearEnd, - success: false, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - 
}); - expect(state).toBe("failed"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("is_open")).toBe("0"); + expect(Number(epHash.get("switched_at"))).toBe(now); }); - it("ignores previous window when cbWindowPeriodMs is 0", () => { + it("does not close when half-open but all attempts failed", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 130_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", now.toString()], - ["cb_prev_failures", "100"], - ["cb_prev_attempts", "100"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], ]), ); - const [, state] = runRecordResult(store, { + const [ok, state] = runRecordResult(store, { now, - success: false, - cbWindowPeriodMs: 0, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 1, }); + + expect(ok).toBe(0); expect(state).toBe("failed"); }); }); - describe("decay period", () => { - it("preserves opened_until_ms during active decay", () => { + describe("sliding window management", () => { + it("promotes current to previous when sampleTill expires", () => { const store = createRedisStore(); - const openedUntil = 1_060_000; - const duringDecay = openedUntil + 100_000; + const now = 1_000_000; + const samplePeriodMs = 300_000; + const sampleTill = now - 1; store.set( - "cb:t1", - new Map([["opened_until_ms", openedUntil.toString()]]), + "ep:t1", + new Map([ + ["sample_till", sampleTill.toString()], + ["cur_attempts", "10"], + ["cur_failures", "3"], + ["prev_attempts", "0"], + ["prev_failures", "0"], + ]), ); - runRecordResult(store, { - now: duringDecay, - success: true, - decayPeriodMs: 300_000, - }); + runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 }); - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil); + const epHash = store.get("ep:t1")!; + 
expect(epHash.get("prev_attempts")).toBe("10"); + expect(epHash.get("prev_failures")).toBe("3"); + expect(Number(epHash.get("sample_till"))).toBe( + sampleTill + samplePeriodMs, + ); }); - it("clears opened_until_ms after decay period elapses", () => { + it("complete reset when window is too old", () => { const store = createRedisStore(); - const openedUntil = 1_060_000; - const decayPeriodMs = 300_000; - const afterDecay = openedUntil + decayPeriodMs + 1; + const now = 1_000_000; + const samplePeriodMs = 300_000; + const sampleTill = now - samplePeriodMs - 1; store.set( - "cb:t1", - new Map([["opened_until_ms", openedUntil.toString()]]), + "ep:t1", + new Map([ + ["sample_till", sampleTill.toString()], + ["cur_attempts", "10"], + ["cur_failures", "3"], + ["prev_attempts", "5"], + ["prev_failures", "2"], + ]), ); - runRecordResult(store, { - now: afterDecay, - success: true, - decayPeriodMs, - }); + runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("prev_attempts")).toBe("0"); + expect(epHash.get("prev_failures")).toBe("0"); + expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs); }); - it("clears opened_until_ms when circuit was never opened", () => { + it("interpolates using weight from sampleTill", () => { const store = createRedisStore(); + const samplePeriodMs = 300_000; const now = 1_000_000; + const sampleTill = now + samplePeriodMs; - runRecordResult(store, { now, success: true }); + store.set( + "ep:t1", + new Map([ + ["sample_till", sampleTill.toString()], + ["prev_attempts", "10"], + ["prev_failures", "10"], + ]), + ); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + // weight = (sampleTill - now) / samplePeriodMs = 1.0 + // interpolated attempts = 10 * 1.0 + 5 = 15 (>= minAttempts 5) + // interpolated failures = 10 * 1.0 + 5 = 15 + // 
failure rate = 15/15 = 1.0 > 0.3 → opens + const [ok, state] = runRecordResult(store, { + now, + samplePeriodMs, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + }); + expect(ok).toBe(0); + expect(state).toBe("opened"); }); }); describe("state persistence", () => { - it("writes all counter fields to redis", () => { + it("writes all sampling fields to redis", () => { const store = createRedisStore(); + store.set("ep:t1", new Map([["sample_till", "9999999999"]])); runRecordResult(store); - const cbHash = store.get("cb:t1")!; - expect(cbHash.has("opened_until_ms")).toBe(true); - expect(cbHash.has("cb_window_from")).toBe(true); - expect(cbHash.has("cb_failures")).toBe(true); - expect(cbHash.has("cb_attempts")).toBe(true); - expect(cbHash.has("cb_prev_failures")).toBe(true); - expect(cbHash.has("cb_prev_attempts")).toBe(true); + const epHash = store.get("ep:t1")!; + expect(epHash.has("cur_attempts")).toBe(true); + expect(epHash.has("cur_failures")).toBe(true); + expect(epHash.has("prev_attempts")).toBe(true); + expect(epHash.has("prev_failures")).toBe(true); + expect(epHash.has("sample_till")).toBe(true); }); }); }); diff --git a/lambdas/https-client-lambda/src/handler.ts b/lambdas/https-client-lambda/src/handler.ts index 28fcc6b9..764e7397 100644 --- a/lambdas/https-client-lambda/src/handler.ts +++ b/lambdas/https-client-lambda/src/handler.ts @@ -12,7 +12,6 @@ import { OUTCOME_SUCCESS, deliverPayload, } from "services/delivery/https-client"; -import type { DeliveryResult } from "services/delivery/https-client"; import { sendToDlq } from "services/dlq-sender"; import { changeVisibility } from "services/sqs-visibility"; import { @@ -26,7 +25,6 @@ import { recordResult, } from "services/endpoint-gate"; import { getRedisClient } from "services/redis-client"; -import { VisibilityManagedError } from "services/visibility-managed-error"; import { recordAdmissionDenied, recordCircuitBreakerClosed, @@ -47,13 +45,20 @@ const 
DEFAULT_MAX_RETRY_DURATION_MS = 7_200_000; // 2 hours const DEFAULT_CONCURRENCY_LIMIT = 5; const gateConfig: EndpointGateConfig = { - burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? "10"), - cbProbeIntervalMs: Number(process.env.CB_PROBE_INTERVAL_MS ?? "60000"), - decayPeriodMs: Number(process.env.CB_DECAY_PERIOD_MS ?? "300000"), - cbWindowPeriodMs: Number(process.env.CB_WINDOW_PERIOD_MS ?? "60000"), - cbErrorThreshold: Number(process.env.CB_ERROR_THRESHOLD ?? "0.5"), - cbMinAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "10"), - cbCooldownMs: Number(process.env.CB_COOLDOWN_MS ?? "60000"), + // Max tokens the bucket can hold — absorbs short traffic bursts without throttling (default: 2250) + burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? "2250"), + // Probe rate to test endpoint recovery when half-open (default: 1/60 req/s) + probeRateLimit: Number(process.env.CB_PROBE_RATE_LIMIT ?? String(1 / 60)), + // Linear ramp-up after circuit closes, avoids flooding a freshly recovered endpoint (default: 10 min) + recoveryPeriodMs: Number(process.env.CB_RECOVERY_PERIOD_MS ?? "600000"), + // Sliding window over which failure rates are sampled (default: 5 min) + samplePeriodMs: Number(process.env.CB_SAMPLE_PERIOD_MS ?? "300000"), + // Failure rate within the sample window that triggers circuit open (default: 30%) + failureThreshold: Number(process.env.CB_FAILURE_THRESHOLD ?? "0.3"), + // Minimum attempts in the sample window before the failure rate is evaluated (default: 5 attempts) + minAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "5"), + // Full block after circuit opens, before half-open probes begin (default: 2 min) + cooldownPeriodMs: Number(process.env.CB_COOLDOWN_PERIOD_MS ?? 
"120000"), }; type CallbackDeliveryMessage = { @@ -62,223 +67,252 @@ type CallbackDeliveryMessage = { targetId: string; }; -async function checkAdmission( - redis: RedisClientType, - targetId: string, - invocationRateLimit: number, - cbEnabled: boolean, - clientId: string, - record: SQSRecord, - correlationId?: string, -): Promise { - const gateResult = await admit( - redis, - targetId, - invocationRateLimit, - cbEnabled, - gateConfig, - ); +type TargetBatch = { + targetId: string; + records: SQSRecord[]; + messages: CallbackDeliveryMessage[]; +}; - if (!gateResult.allowed) { - const delaySec = Math.ceil(gateResult.retryAfterMs / 1000); - recordAdmissionDenied(clientId, targetId, gateResult.reason, correlationId); - await changeVisibility(record.receiptHandle, delaySec); - throw new VisibilityManagedError(`Admission denied: ${gateResult.reason}`); +function groupByTarget(records: SQSRecord[]): TargetBatch[] { + const groups = new Map< + string, + { records: SQSRecord[]; messages: CallbackDeliveryMessage[] } + >(); + + for (const record of records) { + const message: CallbackDeliveryMessage = JSON.parse(record.body); + const existing = groups.get(message.targetId); + if (existing) { + existing.records.push(record); + existing.messages.push(message); + } else { + groups.set(message.targetId, { records: [record], messages: [message] }); + } } -} -const OUTCOME_DELIVERED = "delivered" as const; -const OUTCOME_DLQ = "dlq" as const; -type RecordOutcome = typeof OUTCOME_DELIVERED | typeof OUTCOME_DLQ; + return [...groups.entries()].map( + ([targetId, { messages, records: recs }]) => ({ + targetId, + records: recs, + messages, + }), + ); +} -async function handleDeliveryResult( - result: DeliveryResult, +async function deliverRecord( record: SQSRecord, - redis: RedisClientType, + message: CallbackDeliveryMessage, + target: Awaited>, + applicationId: string, clientId: string, - targetId: string, - cbEnabled: boolean, - correlationId?: string, -): Promise { +): Promise<{ 
success: boolean }> { + const correlationId = message.payload.data[0]?.attributes?.messageId; + + const maxRetryDurationMs = + target.delivery?.maxRetryDurationSeconds === undefined + ? DEFAULT_MAX_RETRY_DURATION_MS + : target.delivery.maxRetryDurationSeconds * 1000; + + const firstReceivedMs = Number( + record.attributes.ApproximateFirstReceiveTimestamp, + ); + + if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) { + recordRetryWindowExhausted(clientId, message.targetId, correlationId); + await sendToDlq(record.body); + return { success: true }; + } + + const agent = await buildAgent(target); + const signature = signPayload( + applicationId, + target.apiKey.headerValue, + message.payload, + ); + const payloadJson = JSON.stringify(message.payload); + + recordDeliveryAttempt(clientId, message.targetId, correlationId); + const deliveryStart = Date.now(); + const result = await deliverPayload(target, payloadJson, signature, agent); + recordDeliveryDuration(message.targetId, Date.now() - deliveryStart); + if (result.outcome === OUTCOME_SUCCESS) { - if (cbEnabled) { - const cbOutcome = await recordResult(redis, targetId, true, gateConfig); - if (cbOutcome.ok && cbOutcome.state === "closed") { - recordCircuitBreakerClosed(targetId, correlationId); - } - } - recordDeliverySuccess(clientId, targetId, correlationId); - return OUTCOME_DELIVERED; + recordDeliverySuccess(clientId, message.targetId, correlationId); + return { success: true }; } if (result.outcome === OUTCOME_PERMANENT_FAILURE) { recordDeliveryPermanentFailure( clientId, - targetId, + message.targetId, result.statusCode, result.errorCode, correlationId, ); await sendToDlq(record.body, result); - return OUTCOME_DLQ; + return { success: true }; } if (result.outcome === OUTCOME_RATE_LIMITED) { const receiveCount = Number(record.attributes.ApproximateReceiveCount); - recordDeliveryRateLimited(clientId, targetId, correlationId); + recordDeliveryRateLimited(clientId, message.targetId, correlationId); await 
handleRateLimitedRecord( record, clientId, - targetId, + message.targetId, result.retryAfterHeader, receiveCount, ); - return OUTCOME_DELIVERED; // unreachable — handleRateLimitedRecord always throws + return { success: true }; } const receiveCount = Number(record.attributes.ApproximateReceiveCount); const backoffSec = jitteredBackoffSeconds(receiveCount); - if (cbEnabled) { - const cbOutcome = await recordResult(redis, targetId, false, gateConfig); - if (cbOutcome.state === "opened") { - recordCircuitBreakerOpen(targetId, correlationId); - } - } recordDeliveryFailure( clientId, - targetId, + message.targetId, result.statusCode, backoffSec, receiveCount, correlationId, ); await changeVisibility(record.receiptHandle, backoffSec); - throw new VisibilityManagedError(`Transient failure: ${result.statusCode}`); + return { success: false }; } -async function processRecord( - record: SQSRecord, +async function processTargetBatch( + batch: TargetBatch, redis: RedisClientType, -): Promise { - const { CLIENT_ID } = process.env; - if (!CLIENT_ID) { - throw new Error("CLIENT_ID is required"); + clientId: string, + concurrencyLimit: number, +): Promise { + const target = await loadTargetConfig(clientId, batch.targetId); + const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? 
false; + + const gateResult = await admit( + redis, + batch.targetId, + target.invocationRateLimit, + cbEnabled, + batch.records.length, + gateConfig, + ); + + if (!gateResult.allowed) { + const delaySec = Math.ceil(gateResult.retryAfterMs / 1000); + recordAdmissionDenied(clientId, batch.targetId, gateResult.reason); + const failures: SQSBatchItemFailure[] = []; + for (const record of batch.records) { + await changeVisibility(record.receiptHandle, delaySec); + failures.push({ itemIdentifier: record.messageId }); + } + return failures; } - const message: CallbackDeliveryMessage = JSON.parse(record.body); - const { payload, targetId } = message; - const messageId = payload.data[0]?.attributes?.messageId; + const { consumedTokens } = gateResult; + const admitted = batch.records.slice(0, consumedTokens); + const rejected = batch.records.slice(consumedTokens); + const admittedMessages = batch.messages.slice(0, consumedTokens); - logger.info("Processing delivery", { - clientId: CLIENT_ID, - targetId, - messageId, - sqsMessageId: record.messageId, - receiveCount: record.attributes.ApproximateReceiveCount, - }); + const applicationId = await getApplicationId(clientId); - const target = await loadTargetConfig(CLIENT_ID, targetId); - const maxRetryDurationMs = - target.delivery?.maxRetryDurationSeconds === undefined - ? 
DEFAULT_MAX_RETRY_DURATION_MS - : target.delivery.maxRetryDurationSeconds * 1000; + const failures: SQSBatchItemFailure[] = []; + let processingFailures = 0; - const firstReceivedMs = Number( - record.attributes.ApproximateFirstReceiveTimestamp, + const deliveryResults = await pMap( + admitted, + async (record, index): Promise<{ record: SQSRecord; success: boolean }> => { + try { + const outcome = await deliverRecord( + record, + admittedMessages[index], + target, + applicationId, + clientId, + ); + return { record, success: outcome.success }; + } catch (error) { + logger.error("Failed to process record", { + messageId: record.messageId, + err: error, + }); + const receiveCount = Number(record.attributes.ApproximateReceiveCount); + await changeVisibility( + record.receiptHandle, + jitteredBackoffSeconds(receiveCount), + ); + return { record, success: false }; + } + }, + { concurrency: concurrencyLimit }, ); - if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) { - recordRetryWindowExhausted(CLIENT_ID, targetId, messageId); - await sendToDlq(record.body); - return OUTCOME_DLQ; + for (const { record, success } of deliveryResults) { + if (!success) { + processingFailures += 1; + failures.push({ itemIdentifier: record.messageId }); + } } - const applicationId = await getApplicationId(CLIENT_ID); - const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? 
false; - - await checkAdmission( - redis, - targetId, - target.invocationRateLimit, - cbEnabled, - CLIENT_ID, - record, - messageId, - ); - - const agent = await buildAgent(target); - const signature = signPayload( - applicationId, - target.apiKey.headerValue, - payload, - ); - const payloadJson = JSON.stringify(payload); + if (cbEnabled && consumedTokens > 0) { + const cbOutcome = await recordResult( + redis, + batch.targetId, + consumedTokens, + processingFailures, + gateConfig, + ); + if (!cbOutcome.ok && cbOutcome.state === "opened") { + recordCircuitBreakerOpen(batch.targetId); + } + if (cbOutcome.ok && cbOutcome.state === "closed") { + recordCircuitBreakerClosed(batch.targetId); + } + } - recordDeliveryAttempt(CLIENT_ID, targetId, messageId); - const deliveryStart = Date.now(); - const result = await deliverPayload(target, payloadJson, signature, agent); - recordDeliveryDuration(targetId, Date.now() - deliveryStart); + for (const record of rejected) { + failures.push({ itemIdentifier: record.messageId }); + } - return handleDeliveryResult( - result, - record, - redis, - CLIENT_ID, - targetId, - cbEnabled, - messageId, - ); + return failures; } export async function processRecords( records: SQSRecord[], ): Promise { - resetMetrics(); + const { CLIENT_ID } = process.env; + if (!CLIENT_ID) { + throw new Error("CLIENT_ID is required"); + } - logger.info("Batch received", { batchSize: records.length }); + resetMetrics(); const concurrencyLimit = Number( process.env.CONCURRENCY_LIMIT ?? 
String(DEFAULT_CONCURRENCY_LIMIT), ); + logger.info("Batch received", { batchSize: records.length }); + const redis = await getRedisClient(); + const targetBatches = groupByTarget(records); - const results = await pMap( - records, - async (record): Promise => { - try { - return await processRecord(record, redis); - } catch (error) { - if (!(error instanceof VisibilityManagedError)) { - logger.error("Failed to process record", { - messageId: record.messageId, - err: error, - }); - const receiveCount = Number( - record.attributes.ApproximateReceiveCount, - ); - await changeVisibility( - record.receiptHandle, - jitteredBackoffSeconds(receiveCount), - ); - } - return { itemIdentifier: record.messageId }; - } - }, - { concurrency: concurrencyLimit }, - ); + const allFailures: SQSBatchItemFailure[] = []; + + for (const batch of targetBatches) { + const batchFailures = await processTargetBatch( + batch, + redis, + CLIENT_ID, + concurrencyLimit, + ); + allFailures.push(...batchFailures); + } - await flushMetrics(); - const failures = results.filter( - (r): r is SQSBatchItemFailure => typeof r === "object", - ); - const deliveredCount = results.filter((r) => r === OUTCOME_DELIVERED).length; - const dlqCount = results.filter((r) => r === OUTCOME_DLQ).length; logger.info("Batch complete", { batchSize: records.length, - deliveredCount, - dlqCount, - failureCount: failures.length, + failureCount: allFailures.length, }); - return failures; + + await flushMetrics(); + return allFailures; } diff --git a/lambdas/https-client-lambda/src/services/admit.lua b/lambdas/https-client-lambda/src/services/admit.lua index fd56decb..36809e40 100644 --- a/lambdas/https-client-lambda/src/services/admit.lua +++ b/lambdas/https-client-lambda/src/services/admit.lua @@ -1,203 +1,98 @@ --- admit.lua — Decides whether a request to an endpoint is allowed. +-- admit.lua — Pre-processing: determines rate limit and consumes tokens. -- --- Three sequential checks run atomically: --- 1. 
Circuit breaker — is the endpoint currently healthy? --- 2. Sliding window — roll the two-window error-rate accounting state if needed --- 3. Token bucket — is the endpoint within its rate limit? +-- Two sequential steps run atomically: +-- 1. Circuit breaker — determine effective rate from circuit state +-- 2. Token bucket — consume tokens for the target batch -- --- A request is allowed only when all three checks pass. +-- The circuit has four states: +-- Open (during cooldown): rate = 0, complete block, bucket untouched +-- Half-open (after cooldown): rate = probeRateLimit +-- Recovering (closed, during recovery period): linear ramp-up +-- Normal (closed): full configured rate -- --- While the circuit is open, a timed probe is let through at most once per --- cbProbeIntervalMs so the caller can test whether the endpoint has recovered. --- The probe bypasses the rate limit — counting it here would skew a --- low-volume probe signal against the recovery decision. --- --- After the circuit closes, the token fill rate ramps up linearly from --- near-zero to full over decayPeriodMs to avoid a thundering herd on recovery. 
--- --- Returns: { allowed (0|1), reason, retryAfterMs, effectiveRate } +-- Returns: { consumedTokens, reason, retryAfterMs, effectiveRate } -- Keys -local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash -local rlKey = KEYS[2] -- rl:{endpoint} rate limiter state hash +local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash -- Arguments -local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms) -local capacity = tonumber(ARGV[2]) or 0 -- token bucket maximum capacity -local refillPerSec = tonumber(ARGV[3]) or 0 -- full token fill rate (tokens/sec) -local cooldownMs = tonumber(ARGV[4]) or 0 -- how long the circuit stays open (ms) -local decayPeriodMs = tonumber(ARGV[5]) or 0 -- ramp-up window after circuit closes (ms) -local cbWindowPeriodMs = tonumber(ARGV[6]) or 0 -- error-rate sliding window duration (ms) -local cbProbeIntervalMs = tonumber(ARGV[7]) or 0 -- minimum gap between probe requests (ms; 0 = no probes) - --- TTL policy: circuit breaker state must outlive the cooldown window so that --- the ramp-up period remains visible to subsequent calls after a close. --- Rate limiter state needs only a short idle window. 
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60 -local rlTtlSeconds = 120 +local now = tonumber(ARGV[1]) or 0 +local capacity = tonumber(ARGV[2]) or 0 +local targetRateLimit = tonumber(ARGV[3]) or 0 +local cooldownMs = tonumber(ARGV[4]) or 0 +local recoveryPeriodMs = tonumber(ARGV[5]) or 0 +local probeRateLimit = tonumber(ARGV[6]) or 0 +local targetBatchSize = tonumber(ARGV[7]) or 0 -------------------------------------------------------------------------------- -- LOAD STATE -------------------------------------------------------------------------------- -local cb = redis.call("HMGET", cbKey, - "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts", "last_probe_ms", - "cb_prev_failures", "cb_prev_attempts") -local openedUntil = tonumber(cb[1] or "0") -local cbWindowFrom = tonumber(cb[2] or "0") -local cbFailures = tonumber(cb[3] or "0") -local cbAttempts = tonumber(cb[4] or "0") -local lastProbeMs = tonumber(cb[5] or "0") -local cbPrevFailures = tonumber(cb[6] or "0") -local cbPrevAttempts = tonumber(cb[7] or "0") - -local rl = redis.call("HMGET", rlKey, "tokens", "last_refill_ms") -local tokens = tonumber(rl[1] or capacity) -local lastRefill = tonumber(rl[2] or now) +local state = redis.call("HMGET", epKey, + "is_open", "switched_at", "bucket_tokens", "bucket_refilled_at") +local isOpen = tonumber(state[1] or "0") == 1 +local switchedAtRaw = state[2] +local switchedAt = tonumber(switchedAtRaw or tostring(now)) +local bucketTokens = tonumber(state[3] or "0") +local bucketRefilledAt = tonumber(state[4] or "0") +local needInitSwitchedAt = switchedAtRaw == false or switchedAtRaw == nil -------------------------------------------------------------------------------- --- 1. CIRCUIT BREAKER --- --- The circuit is open when openedUntil is set and has not yet elapsed. --- All requests are rejected while open to give the endpoint time to recover. 
--- --- Timed probes: once per cbProbeIntervalMs a single request is allowed --- through even while the circuit is open. The caller must record the --- outcome via record-result.lua; a successful probe will close the circuit --- and trigger the ramp-up phase. +-- 1. CIRCUIT BREAKER — determine effective rate -------------------------------------------------------------------------------- -if openedUntil > 0 and now < openedUntil then - -- Allow a probe through if the probe interval has elapsed - if cbProbeIntervalMs > 0 and (now - lastProbeMs) >= cbProbeIntervalMs then - lastProbeMs = now - redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "last_probe_ms", lastProbeMs, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - redis.call("EXPIRE", cbKey, cbTtlSeconds) - return { 1, "probe", 0, 0 } - end +local isHalfOpen = isOpen and now > switchedAt + cooldownMs +local isRecovering = (not isOpen) and now < switchedAt + recoveryPeriodMs - -- Circuit is open and no probe slot is available — reject - return { 0, "circuit_open", openedUntil - now, 0 } -end - --------------------------------------------------------------------------------- --- 2. SLIDING WINDOW --- --- Two windows (current + previous) together approximate a sliding window over --- cbWindowPeriodMs. When the current window expires it is promoted to previous --- and a fresh current window starts. record-result.lua blends the two windows --- using a time-based weight to smooth the error rate across the boundary rather --- than resetting it to zero at expiry. --- --- record-result.lua is responsible for incrementing the counters; this script --- is only responsible for rolling the window boundary forward when it expires. 
--------------------------------------------------------------------------------- +local effectiveRate -if cbWindowFrom == 0 then - -- No window exists yet — start one now - cbWindowFrom = now -elseif (now - cbWindowFrom) > cbWindowPeriodMs then - -- Current window has expired — roll it forward - if (now - cbWindowFrom) > (2 * cbWindowPeriodMs) then - -- Both current and previous windows are stale: a long quiet period means - -- old failure counts are no longer relevant to the health of the endpoint. - cbPrevFailures = 0 - cbPrevAttempts = 0 +if isOpen then + if isHalfOpen then + effectiveRate = probeRateLimit else - -- Promote current → previous so it can be blended with the new current window - cbPrevFailures = cbFailures - cbPrevAttempts = cbAttempts + return { 0, "circuit_open", (switchedAt + cooldownMs) - now, 0 } + end +else + if isRecovering then + effectiveRate = targetRateLimit * (now - switchedAt) / recoveryPeriodMs + else + effectiveRate = targetRateLimit end - cbFailures = 0 - cbAttempts = 0 - cbWindowFrom = now end -------------------------------------------------------------------------------- --- 3. TOKEN BUCKET +-- 2. TOKEN BUCKET — batch consumption -- --- Refills tokens based on elapsed time, then tries to consume one. --- If no tokens are available the request is rate-limited. +-- Generate tokens based on elapsed time, then consume as many as needed for +-- the batch, up to the number available. -- --- Ramp-up: after the circuit closes (openedUntil is set but in the past), --- effectiveRate scales linearly from near-zero to the full refillPerSec over --- decayPeriodMs. This deliberately slows recovery traffic so a flapping --- endpoint is not immediately overwhelmed. --- Once decayPeriodMs elapses, openedUntil is cleared and the full rate resumes. +-- Refill precision: bucketRefilledAt advances by exactly the time required to +-- generate the whole tokens (not set to `now`), preserving fractional time. 
-------------------------------------------------------------------------------- -local effectiveRate = refillPerSec - -if openedUntil > 0 and now > openedUntil and decayPeriodMs > 0 then - -- Circuit has recently closed — apply linear ramp-up - local sinceClose = now - openedUntil - if sinceClose >= decayPeriodMs then - -- Decay period fully elapsed — restore full rate and clear the CB timestamp - openedUntil = 0 - else - -- Still within decay period — scale fill rate proportionally to time elapsed - local fraction = sinceClose / decayPeriodMs - effectiveRate = math.max(1, math.floor(refillPerSec * fraction)) - end -end - --- Refill tokens based on time elapsed since last refill -local elapsed = now - lastRefill -if elapsed > 0 then - local refill = math.floor((elapsed * effectiveRate) / 1000) - if refill > 0 then - tokens = math.min(capacity, tokens + refill) - lastRefill = now - end -end +local generatedTokens = math.floor((now - bucketRefilledAt) * effectiveRate / 1000) +local availTokens = math.min(capacity, bucketTokens + generatedTokens) +local consumedTokens = math.min(targetBatchSize, availTokens) --- Not enough tokens — rate-limited --- TTL is intentionally not refreshed here; it was set on the last allowed call. -if tokens < 1 then - redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - redis.call("HSET", rlKey, - "tokens", tokens, - "last_refill_ms", lastRefill - ) - return { 0, "rate_limited", 1000, effectiveRate } +bucketTokens = availTokens - consumedTokens +if generatedTokens > 0 and effectiveRate > 0 then + local generationTime = generatedTokens * 1000 / effectiveRate + bucketRefilledAt = bucketRefilledAt + generationTime end --- Consume one token -tokens = tokens - 1 - -------------------------------------------------------------------------------- --- 4. 
PERSIST STATE AND ALLOW +-- 3. PERSIST STATE AND RETURN -------------------------------------------------------------------------------- -redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts -) -redis.call("HSET", rlKey, - "tokens", tokens, - "last_refill_ms", lastRefill +redis.call("HSET", epKey, + "bucket_tokens", bucketTokens, + "bucket_refilled_at", bucketRefilledAt ) -redis.call("EXPIRE", cbKey, cbTtlSeconds) -redis.call("EXPIRE", rlKey, rlTtlSeconds) +if needInitSwitchedAt then + redis.call("HSET", epKey, "switched_at", switchedAt) +end -return { 1, "allowed", 0, effectiveRate } +local reason = consumedTokens < 1 and "rate_limited" or "allowed" +local retryAfter = consumedTokens < 1 and 1000 or 0 +return { consumedTokens, reason, retryAfter, effectiveRate } diff --git a/lambdas/https-client-lambda/src/services/endpoint-gate.ts b/lambdas/https-client-lambda/src/services/endpoint-gate.ts index c2d85439..bf9c1462 100644 --- a/lambdas/https-client-lambda/src/services/endpoint-gate.ts +++ b/lambdas/https-client-lambda/src/services/endpoint-gate.ts @@ -5,7 +5,7 @@ import recordResultLuaSrc from "services/record-result.lua"; export type AdmitResultAllowed = { allowed: true; - probe: boolean; + consumedTokens: number; effectiveRate: number; }; @@ -24,12 +24,12 @@ export type RecordResultOutcome = export type EndpointGateConfig = { burstCapacity: number; - cbProbeIntervalMs: number; - decayPeriodMs: number; - cbWindowPeriodMs: number; - cbErrorThreshold: number; - cbMinAttempts: number; - cbCooldownMs: number; + probeRateLimit: number; + recoveryPeriodMs: number; + samplePeriodMs: number; + failureThreshold: number; + minAttempts: number; + cooldownPeriodMs: number; }; let admitSha: string | undefined; @@ -76,22 +76,21 @@ export async function admit( targetId: string, refillPerSec: number, 
cbEnabled: boolean, + targetBatchSize: number, config: EndpointGateConfig, ): Promise { - const cbKey = `cb:{${targetId}}`; - const rlKey = `rl:{${targetId}}`; + const epKey = `ep:{${targetId}}`; const now = Date.now().toString(); - const probeIntervalMs = cbEnabled ? config.cbProbeIntervalMs.toString() : "0"; + const probeRate = cbEnabled ? config.probeRateLimit.toString() : "0"; const args = [ now, config.burstCapacity.toString(), - // eslint-disable-next-line sonarjs/null-dereference - refillPerSec.toString(), - config.cbCooldownMs.toString(), - config.decayPeriodMs.toString(), - config.cbWindowPeriodMs.toString(), - probeIntervalMs, + String(refillPerSec), + config.cooldownPeriodMs.toString(), + config.recoveryPeriodMs.toString(), + probeRate, + String(targetBatchSize), ]; if (!admitSha) { @@ -102,16 +101,16 @@ export async function admit( client, admitLuaSrc, admitSha, - [cbKey, rlKey], + [epKey], args, )) as [number, string, number, number]; - const [allowed, reason, retryAfterMs, effectiveRate] = raw; + const [consumedOrFlag, reason, retryAfterMs, effectiveRate] = raw; - if (allowed === 1) { + if (reason === "allowed" || reason === "probe") { return { allowed: true, - probe: reason === "probe", + consumedTokens: Number(consumedOrFlag), effectiveRate: Number(effectiveRate), }; } @@ -127,20 +126,22 @@ export async function admit( export async function recordResult( client: RedisClientType, targetId: string, - success: boolean, + consumedTokens: number, + processingFailures: number, config: EndpointGateConfig, ): Promise { - const cbKey = `cb:{${targetId}}`; + const epKey = `ep:{${targetId}}`; const now = Date.now().toString(); const args = [ now, - success ? 
"1" : "0", - config.cbCooldownMs.toString(), - config.decayPeriodMs.toString(), - config.cbErrorThreshold.toString(), - config.cbMinAttempts.toString(), - config.cbWindowPeriodMs.toString(), + String(consumedTokens), + String(processingFailures), + config.cooldownPeriodMs.toString(), + config.recoveryPeriodMs.toString(), + config.failureThreshold.toString(), + config.minAttempts.toString(), + config.samplePeriodMs.toString(), ]; if (!recordResultSha) { @@ -151,7 +152,7 @@ export async function recordResult( client, recordResultLuaSrc, recordResultSha, - [cbKey], + [epKey], args, )) as [number, string]; diff --git a/lambdas/https-client-lambda/src/services/record-result.lua b/lambdas/https-client-lambda/src/services/record-result.lua index 1cc94857..fa3b1b12 100644 --- a/lambdas/https-client-lambda/src/services/record-result.lua +++ b/lambdas/https-client-lambda/src/services/record-result.lua @@ -1,144 +1,150 @@ --- record-result.lua — Records the outcome of a delivery attempt. +-- record-result.lua — Post-processing: updates sampling and circuit breaker. -- --- Updates the circuit breaker's error-rate window counters and opens the --- circuit if the failure rate exceeds the configured threshold. --- --- On success: --- Window counters are left intact. The openedUntil timestamp is preserved --- while the decay period is still active so that admit.lua can continue --- computing the linear ramp-up rate. Once the decay period elapses it --- is zeroed, returning the circuit to a fully clean closed state. --- --- On failure: --- The failure and attempt counters are incremented. A two-window sliding --- blend is computed before evaluating the trip condition: --- slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight --- slidingFailures = cbFailures + cbPrevFailures * prevWeight --- where prevWeight decays linearly from 1.0 → 0.0 as the current window ages, --- so previous-window failures fade out gradually rather than dropping off a cliff. 
--- The circuit opens when: --- • the endpoint is not already open (prevents double-tripping and --- resetting the cooldown timer prematurely), AND --- • slidingAttempts >= cbMinAttempts (avoids tripping on statistically --- insignificant data at cold start or just after a window roll), AND --- • slidingFailures / slidingAttempts exceeds cbErrorThreshold. --- On open, all counters (current and previous) are reset to zero so the --- fresh cooldown window begins with a clean slate ready for recovery. +-- After processing a batch, this script: +-- 1. Manages the sliding window (rolling forward as necessary) +-- 2. Records new attempts and failures (unless fully open) +-- 3. Interpolates attempt/failure rates using the sliding window +-- 4. Checks whether to close the circuit (half-open + successes) +-- 5. Checks whether to open the circuit (closed + threshold exceeded) -- -- Returns: { ok (0|1), state } -- state: "closed" | "opened" | "failed" +-- Return state constants +local OPENED = "opened" +local CLOSED = "closed" +local FAILED = "failed" + -- Keys -local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash +local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash -- Arguments -local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms) -local success = tonumber(ARGV[2]) or 0 -- 1 = success, 0 = failure -local cooldownMs = tonumber(ARGV[3]) or 0 -- how long the circuit stays open (ms) -local decayPeriodMs = tonumber(ARGV[4]) or 0 -- ramp-up window after circuit closes (ms) -local cbErrorThreshold = tonumber(ARGV[5]) or 0 -- error-rate fraction that trips the circuit (e.g. 0.5) -local cbMinAttempts = tonumber(ARGV[6]) or 0 -- minimum samples before the circuit can trip -local cbWindowPeriodMs = tonumber(ARGV[7]) or 0 -- error-rate sliding window duration (ms) - --- TTL policy: keep circuit breaker state alive for at least the cooldown --- duration plus a buffer so the decay period remains visible after a close. 
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60 - -local function refreshCbExpiry() - redis.call("EXPIRE", cbKey, cbTtlSeconds) -end +local now = tonumber(ARGV[1]) or 0 +local consumedTokens = tonumber(ARGV[2]) or 0 +local processingFailures = tonumber(ARGV[3]) or 0 +local cooldownPeriodMs = tonumber(ARGV[4]) or 0 +local _recoveryPeriodMs = tonumber(ARGV[5]) or 0 -- luacheck: ignore +local failureThreshold = tonumber(ARGV[6]) or 0 +local minAttempts = tonumber(ARGV[7]) or 0 +local samplePeriodMs = tonumber(ARGV[8]) or 0 -------------------------------------------------------------------------------- -- LOAD CURRENT STATE -------------------------------------------------------------------------------- -local cb = redis.call("HMGET", cbKey, - "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts", - "cb_prev_failures", "cb_prev_attempts") -local openedUntil = tonumber(cb[1] or "0") -local cbWindowFrom = tonumber(cb[2] or "0") -local cbFailures = tonumber(cb[3] or "0") -local cbAttempts = tonumber(cb[4] or "0") -local cbPrevFailures = tonumber(cb[5] or "0") -local cbPrevAttempts = tonumber(cb[6] or "0") +local state = redis.call("HMGET", epKey, + "is_open", "switched_at", + "cur_attempts", "prev_attempts", "cur_failures", "prev_failures", + "sample_till") +local isOpen = tonumber(state[1] or "0") == 1 +local switchedAt = tonumber(state[2] or tostring(now)) +local curAttempts = tonumber(state[3] or "0") +local prevAttempts = tonumber(state[4] or "0") +local curFailures = tonumber(state[5] or "0") +local prevFailures = tonumber(state[6] or "0") +local sampleTill = tonumber(state[7] or "0") --- Every outcome (success or failure) contributes to the error-rate window -cbAttempts = cbAttempts + 1 +-------------------------------------------------------------------------------- +-- 1. 
DETERMINE CIRCUIT SUB-STATE +-------------------------------------------------------------------------------- + +local isHalfOpen = isOpen and now > switchedAt + cooldownPeriodMs +local isFullyOpen = isOpen and not isHalfOpen -------------------------------------------------------------------------------- --- SUCCESS — preserve openedUntil during decay, then zero it --- --- admit.lua uses openedUntil to calculate the linear ramp-up rate while the --- decay period is active. That timestamp must survive in Redis until the --- decay period ends. Clearing it prematurely would snap the fill rate back --- to full immediately rather than ramping gradually. +-- 2. MANAGE SLIDING WINDOW -------------------------------------------------------------------------------- -if success == 1 then - -- Keep openedUntil only if we are still within the decay window - local inDecayWindow = openedUntil > 0 and now > openedUntil and (now - openedUntil) < decayPeriodMs - local preservedOpenedUntil = inDecayWindow and openedUntil or 0 - - redis.call("HSET", cbKey, - "opened_until_ms", preservedOpenedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - refreshCbExpiry() - return { 1, "closed" } +if sampleTill < now then + if sampleTill + samplePeriodMs < now then + -- Complete reset — window is too old + prevAttempts = 0 + prevFailures = 0 + sampleTill = now + samplePeriodMs + else + -- Promote current to previous + prevAttempts = curAttempts + prevFailures = curFailures + sampleTill = sampleTill + samplePeriodMs + end + curAttempts = 0 + curFailures = 0 end -------------------------------------------------------------------------------- --- FAILURE — increment counter and evaluate whether to open the circuit --- --- The trip condition is evaluated against a sliding blend of current and --- previous window counts, not the raw current-window counts alone. 
This --- prevents a burst of failures from escaping detection simply because it --- straddles a window boundary and gets partially discarded by a reset. +-- 3. RECORD NEW ATTEMPTS/FAILURES (unless fully open) -------------------------------------------------------------------------------- -cbFailures = cbFailures + 1 - --- The circuit is already open when openedUntil is set and has not yet elapsed. --- Guard against double-tripping, which would reset the cooldown timer early. -local circuitAlreadyOpen = openedUntil > 0 and now < openedUntil - --- Blend current and previous window counts. --- prevWeight decays linearly from 1.0 → 0.0 as the current window ages, --- so previous-window failures fade out gradually rather than dropping off a cliff. -local windowElapsed = cbWindowFrom > 0 and (now - cbWindowFrom) or 0 -local hasWindow = cbWindowPeriodMs > 0 -local prevWeight = hasWindow and math.max(0, (cbWindowPeriodMs - windowElapsed) / cbWindowPeriodMs) or 0 -local slidingFailures = cbFailures + cbPrevFailures * prevWeight -local slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight - -if not circuitAlreadyOpen - and slidingAttempts >= cbMinAttempts -- enough data to be statistically meaningful - and (slidingFailures / slidingAttempts) > cbErrorThreshold then - -- Trip the circuit — reset all counters so recovery starts from a clean slate - redis.call("HSET", cbKey, - "opened_until_ms", now + cooldownMs, - "cb_window_from", 0, - "cb_failures", 0, - "cb_attempts", 0, - "cb_prev_failures", 0, - "cb_prev_attempts", 0 - ) - refreshCbExpiry() - return { 0, "opened" } +if not isFullyOpen then + curAttempts = curAttempts + consumedTokens + curFailures = curFailures + processingFailures +end + +-------------------------------------------------------------------------------- +-- 4. 
INTERPOLATE VALUES +-------------------------------------------------------------------------------- + +local weight = (sampleTill - now) / samplePeriodMs +local attempts = prevAttempts * weight + curAttempts +local failures = prevFailures * weight + curFailures + +-------------------------------------------------------------------------------- +-- 5. CIRCUIT BREAKER LOGIC +-------------------------------------------------------------------------------- + +local processingSuccesses = consumedTokens - processingFailures +local stateChanged = false + +-- Close circuit when half-open and there are successes +if isHalfOpen and processingSuccesses > 0 then + isOpen = false + switchedAt = now + stateChanged = true + -- fall through, allow circuit to immediately re-open +end + +-- Open circuit when closed, enough samples, and threshold exceeded +local hasSampledEnough = attempts >= minAttempts +if not isOpen and hasSampledEnough and (failures / attempts) > failureThreshold then + isOpen = true + switchedAt = now + curAttempts = 0 + curFailures = 0 + prevAttempts = 0 + prevFailures = 0 + sampleTill = now + samplePeriodMs + stateChanged = true end --- Below the threshold — record the failure but keep the circuit closed -redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts +-------------------------------------------------------------------------------- +-- 6. 
PERSIST STATE +-------------------------------------------------------------------------------- + +redis.call("HSET", epKey, + "cur_attempts", curAttempts, + "prev_attempts", prevAttempts, + "cur_failures", curFailures, + "prev_failures", prevFailures, + "sample_till", sampleTill ) -refreshCbExpiry() -return { 0, "failed" } + +if stateChanged then + redis.call("HSET", epKey, + "is_open", isOpen and 1 or 0, + "switched_at", switchedAt + ) +end + +if stateChanged and isOpen then + return { 0, OPENED } +end + +if stateChanged and not isOpen then + return { 1, CLOSED } +end + +if isOpen or processingFailures > 0 then + return { 0, FAILED } +end + +return { 1, CLOSED } diff --git a/lambdas/perf-runner-lambda/package.json b/lambdas/perf-runner-lambda/package.json index 9f9d01d8..59d7691b 100644 --- a/lambdas/perf-runner-lambda/package.json +++ b/lambdas/perf-runner-lambda/package.json @@ -13,13 +13,17 @@ "typecheck": "tsc --noEmit" }, "dependencies": { + "@aws-crypto/sha256-js": "catalog:aws", "@aws-sdk/client-cloudwatch-logs": "catalog:aws", "@aws-sdk/client-sqs": "catalog:aws", + "@aws-sdk/credential-providers": "catalog:aws", + "@smithy/signature-v4": "catalog:aws", "@nhs-notify-client-callbacks/logger": "workspace:*", "@nhs-notify-client-callbacks/models": "workspace:*", - "esbuild": "catalog:tools" + "@redis/client": "catalog:app" }, "devDependencies": { + "esbuild": "catalog:tools", "@tsconfig/node22": "catalog:tools", "@types/aws-lambda": "catalog:tools", "@types/jest": "catalog:test", diff --git a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts index 055ac7bc..526de638 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts @@ -1,5 +1,10 @@ import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { + 
queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; const mockCloudWatchClient = { send: jest.fn(), @@ -285,3 +290,373 @@ describe("queryDeliveryMetricsSnapshot", () => { expect(result).toBeNull(); }); }); + +describe("queryCircuitBreakerSnapshot", () => { + it("returns null when logGroupNames is empty", async () => { + const result = await queryCircuitBreakerSnapshot( + mockCloudWatchClient, + [], + 0, + 60, + ); + + expect(result).toBeNull(); + expect(mockCloudWatchClient.send).not.toHaveBeenCalled(); + }); + + it("returns null when StartQuery returns no queryId", async () => { + mockCloudWatchClient.send.mockResolvedValueOnce({} as never); + + const result = await queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 0, + 60, + ); + + expect(result).toBeNull(); + }); + + it("returns a snapshot with zeroed metrics when the result row is empty", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb1" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 100, + 160, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toMatchObject({ + intervalStartSec: 100, + intervalEndSec: 160, + circuitOpenEvents: 0, + circuitCloseEvents: 0, + admissionDeniedCircuitOpen: 0, + admissionDeniedRateLimited: 0, + deliveryAttempts: 0, + deliverySuccesses: 0, + deliveryFailures: 0, + deliveryRateLimited: 0, + }); + expect(result?.snapshotAt).toBeGreaterThan(0); + }); + + it("returns a populated snapshot when query completes successfully", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb2" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: 
"circuitOpenEvents", value: "3" }, + { field: "circuitCloseEvents", value: "2" }, + { field: "admissionDeniedCircuitOpen", value: "15" }, + { field: "admissionDeniedRateLimited", value: "8" }, + { field: "deliveryAttempts", value: "200" }, + { field: "deliverySuccesses", value: "180" }, + { field: "deliveryFailures", value: "12" }, + { field: "deliveryRateLimited", value: "8" }, + ], + ], + } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 100, + 160, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toMatchObject({ + intervalStartSec: 100, + intervalEndSec: 160, + circuitOpenEvents: 3, + circuitCloseEvents: 2, + admissionDeniedCircuitOpen: 15, + admissionDeniedRateLimited: 8, + deliveryAttempts: 200, + deliverySuccesses: 180, + deliveryFailures: 12, + deliveryRateLimited: 8, + }); + }); + + it("sends logGroupNames to StartQuery", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb3" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const logGroups = [ + "/aws/lambda/test-https-client-perf-client-1", + "/aws/lambda/test-https-client-perf-client-2", + ]; + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + logGroups, + 0, + 60, + ); + + await jest.runAllTimersAsync(); + await promise; + + const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as { + input: { logGroupNames: string[] }; + }; + expect(startCmd.input.logGroupNames).toEqual(logGroups); + }); + + it("returns null when the query status is Failed", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb4" } as never) + .mockResolvedValueOnce({ status: "Failed" } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 0, + 60, + ); + + await jest.runAllTimersAsync(); + 
const result = await promise; + + expect(result).toBeNull(); + }); +}); + +describe("queryPerClientRateTimeline", () => { + it("returns empty array when StartQuery returns no queryId", async () => { + mockCloudWatchClient.send.mockResolvedValueOnce({} as never); + + const result = await queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + expect(result).toEqual([]); + }); + + it("returns empty array when the query status is Failed", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr1" } as never) + .mockResolvedValueOnce({ status: "Failed" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when results are empty", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when results is undefined", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2b" } as never) + .mockResolvedValueOnce({ status: "Complete" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("defaults missing fields to zero", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2c" } as never) + 
.mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "unknownField", value: "123" }]], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(1); + expect(result[0].deliveryAttempts).toBe(0); + expect(result[0].timestampSec).toBe( + Math.floor(new Date("0").getTime() / 1000), + ); + }); + + it("returns entries sorted by time bin when query completes", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr3" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "timeBin", value: "2026-04-09 10:00:00.000" }, + { field: "deliveryAttempts", value: "42" }, + ], + [ + { field: "timeBin", value: "2026-04-09 10:00:10.000" }, + { field: "deliveryAttempts", value: "38" }, + ], + ], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(2); + expect(result[0]).toEqual({ + timestampSec: Math.floor( + new Date("2026-04-09 10:00:00.000").getTime() / 1000, + ), + deliveryAttempts: 42, + }); + expect(result[1]).toEqual({ + timestampSec: Math.floor( + new Date("2026-04-09 10:00:10.000").getTime() / 1000, + ), + deliveryAttempts: 38, + }); + }); + + it("sends logGroupName to StartQuery", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr4" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 100, + 200, + ); + + await jest.runAllTimersAsync(); + await promise; + + const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as { 
+ input: { logGroupName: string; startTime: number; endTime: number }; + }; + expect(startCmd.input.logGroupName).toBe( + "/aws/lambda/test-https-client-perf-client-1", + ); + expect(startCmd.input.startTime).toBe(100); + expect(startCmd.input.endTime).toBe(200); + }); + + it("polls until the query becomes Complete", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr5" } as never) + .mockResolvedValueOnce({ status: "Running" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "timeBin", value: "2026-04-09 10:00:00.000" }, + { field: "deliveryAttempts", value: "5" }, + ], + ], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(1); + expect(result[0].deliveryAttempts).toBe(5); + expect(mockCloudWatchClient.send).toHaveBeenCalledTimes(3); + }); + + it("returns empty array when the query does not complete within the timeout", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr6" } as never) + .mockResolvedValue({ status: "Running" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.advanceTimersByTimeAsync(60_000); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when the query status is Cancelled", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr7" } as never) + .mockResolvedValueOnce({ status: "Cancelled" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); +}); diff 
--git a/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts new file mode 100644 index 00000000..09846ed3 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts @@ -0,0 +1,74 @@ +import { flushElastiCache } from "elasticache"; +import type { ElastiCacheDeps } from "types"; + +const mockConnect = jest.fn().mockResolvedValue(undefined); +const mockFlushAll = jest.fn().mockResolvedValue("OK"); +const mockDisconnect = jest.fn().mockResolvedValue(undefined); +let mockIsOpen = true; + +jest.mock("@redis/client", () => ({ + createClient: jest.fn(() => ({ + connect: mockConnect, + flushAll: mockFlushAll, + disconnect: mockDisconnect, + get isOpen() { + return mockIsOpen; + }, + })), +})); + +jest.mock("@smithy/signature-v4", () => ({ + SignatureV4: jest.fn(() => ({ + presign: jest.fn().mockResolvedValue({ + query: { + "X-Amz-Algorithm": "AWS4-HMAC-SHA256", + "X-Amz-Credential": "test-credential", + }, + }), + })), +})); + +jest.mock("@aws-crypto/sha256-js", () => ({ + Sha256: jest.fn(), +})); + +jest.mock("@aws-sdk/credential-providers", () => ({ + fromNodeProviderChain: jest.fn(() => ({})), +})); + +const deps: ElastiCacheDeps = { + endpoint: "test-cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", +}; + +beforeEach(() => { + jest.clearAllMocks(); + mockIsOpen = true; +}); + +describe("flushElastiCache", () => { + it("connects, flushes all keys, and disconnects", async () => { + await flushElastiCache(deps); + + expect(mockConnect).toHaveBeenCalledTimes(1); + expect(mockFlushAll).toHaveBeenCalledTimes(1); + expect(mockDisconnect).toHaveBeenCalledTimes(1); + }); + + it("disconnects even when flushAll throws", async () => { + mockFlushAll.mockRejectedValueOnce(new Error("flush failed")); + + await expect(flushElastiCache(deps)).rejects.toThrow("flush failed"); + expect(mockDisconnect).toHaveBeenCalledTimes(1); + }); + + 
it("skips disconnect when client is not open", async () => { + mockIsOpen = false; + + await flushElastiCache(deps); + + expect(mockDisconnect).not.toHaveBeenCalled(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts index 1d1a501a..3c33bfd6 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts @@ -32,6 +32,7 @@ const mockResult: PerformanceResult = { phases: [], metrics: [], deliveryMetrics: [], + circuitBreakerMetrics: [], }; beforeEach(() => { @@ -42,6 +43,11 @@ beforeEach(() => { "/aws/lambda/nhs-dev-callbacks-client-transform-filter"; process.env.DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/nhs-dev-callbacks-https-client-"; + process.env.MOCK_WEBHOOK_LOG_GROUP = + "/aws/lambda/nhs-dev-callbacks-mock-webhook"; + process.env.ELASTICACHE_ENDPOINT = "cache.example.invalid"; + process.env.ELASTICACHE_CACHE_NAME = "test-cache"; + process.env.ELASTICACHE_IAM_USERNAME = "test-user"; process.env.AWS_REGION = "eu-west-2"; }); @@ -55,9 +61,17 @@ describe("handler", () => { queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-https-client-", + mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", }), DEFAULT_SCENARIO, "test-id", + undefined, + expect.objectContaining({ + endpoint: "cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", + }), ); }); @@ -73,6 +87,8 @@ describe("handler", () => { expect.anything(), customScenario, "custom-test", + undefined, + expect.anything(), ); }); @@ -117,6 +133,38 @@ describe("handler", () => { }), DEFAULT_SCENARIO, "no-prefix-test", + undefined, + expect.anything(), + ); + }); + + it("passes undefined elastiCacheDeps when ElastiCache env vars are missing", async () => { + delete 
process.env.ELASTICACHE_ENDPOINT; + delete process.env.ELASTICACHE_CACHE_NAME; + delete process.env.ELASTICACHE_IAM_USERNAME; + + await handler({ testId: "no-cache-test" }); + + expect(mockRunPerformanceTest).toHaveBeenCalledWith( + expect.anything(), + DEFAULT_SCENARIO, + "no-cache-test", + undefined, + undefined, + ); + }); + + it("passes mockWebhookLogGroup from env var", async () => { + await handler({ testId: "webhook-test" }); + + expect(mockRunPerformanceTest).toHaveBeenCalledWith( + expect.objectContaining({ + mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", + }), + expect.anything(), + "webhook-test", + undefined, + expect.anything(), ); }); }); diff --git a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts new file mode 100644 index 00000000..60347ef9 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts @@ -0,0 +1,116 @@ +import type { SQSClient } from "@aws-sdk/client-sqs"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import type { Scenario } from "types"; + +const scenario: Scenario = { + phases: [{ durationSecs: 1, targetEps: 10 }], + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + metricsIntervalSecs: 5, +}; + +const inboundQueueUrl = + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue"; + +describe("deriveQueueUrls", () => { + it("derives all queue URLs from the inbound queue URL and scenario", () => { + const urls = deriveQueueUrls(inboundQueueUrl, scenario); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + 
"https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-dlq-queue", + ]); + }); + + it("deduplicates client IDs that appear multiple times in eventMix", () => { + const duplicateScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-1", + channelStatus: "DELIVERED", + }, + ], + }; + + const urls = deriveQueueUrls(inboundQueueUrl, duplicateScenario); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue", + ]); + }); +}); + +describe("purgeQueues", () => { + const mockSend = jest.fn().mockResolvedValue({}); + const mockSqsClient = { send: mockSend } as unknown as SQSClient; + + beforeEach(() => { + jest.clearAllMocks(); + mockSend.mockResolvedValue({}); + }); + + it("sends a PurgeQueueCommand for each queue URL", async () => { + const urls = [ + "https://sqs.example.invalid/queue-a", + "https://sqs.example.invalid/queue-b", + ]; + + await purgeQueues(mockSqsClient, urls); + + expect(mockSend).toHaveBeenCalledTimes(2); + }); + + it("ignores NonExistentQueue errors gracefully", async () => { + const nonExistentError = Object.assign(new Error("Queue does not exist"), { + name: "AWS.SimpleQueueService.NonExistentQueue", + }); + 
mockSend.mockRejectedValueOnce(nonExistentError); + + await expect( + purgeQueues(mockSqsClient, ["https://sqs.example.invalid/missing"]), + ).resolves.toBeUndefined(); + }); + + it("rethrows non-NonExistentQueue errors", async () => { + const otherError = new Error("Access denied"); + mockSend.mockRejectedValueOnce(otherError); + + await expect( + purgeQueues(mockSqsClient, ["https://sqs.example.invalid/queue"]), + ).rejects.toThrow("Access denied"); + }); + + it("handles an empty queue URL list without sending any commands", async () => { + await purgeQueues(mockSqsClient, []); + + expect(mockSend).not.toHaveBeenCalled(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts index 1cf5f3a3..46a0928d 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts @@ -1,6 +1,7 @@ import type { SQSClient } from "@aws-sdk/client-sqs"; import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; import type { + CircuitBreakerSnapshot, DeliveryMetricsSnapshot, MetricsSnapshot, PhaseResult, @@ -10,16 +11,35 @@ import type { import { defaultSleep, runPerformanceTest } from "runner"; import { generatePhaseLoad } from "sqs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import { flushElastiCache } from "elasticache"; +import { verifyMockWebhook } from "webhook-verify"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; jest.mock("sqs"); jest.mock("cloudwatch"); +jest.mock("purge"); +jest.mock("elasticache"); +jest.mock("webhook-verify"); const mockGeneratePhaseLoad = jest.mocked(generatePhaseLoad); const mockQueryMetricsSnapshot = jest.mocked(queryMetricsSnapshot); const mockQueryDeliveryMetricsSnapshot = jest.mocked( 
queryDeliveryMetricsSnapshot, ); +const mockQueryCircuitBreakerSnapshot = jest.mocked( + queryCircuitBreakerSnapshot, +); +const mockQueryPerClientRateTimeline = jest.mocked(queryPerClientRateTimeline); +const mockDeriveQueueUrls = jest.mocked(deriveQueueUrls); +const mockPurgeQueues = jest.mocked(purgeQueues); +const mockFlushElastiCache = jest.mocked(flushElastiCache); +const mockVerifyMockWebhook = jest.mocked(verifyMockWebhook); const immediateSleep = jest.fn().mockResolvedValue(undefined); @@ -46,6 +66,20 @@ const mockDeliverySnapshot: DeliveryMetricsSnapshot = { p99Ms: 500, }; +const mockCbSnapshot: CircuitBreakerSnapshot = { + snapshotAt: Date.now(), + intervalStartSec: 0, + intervalEndSec: 60, + circuitOpenEvents: 1, + circuitCloseEvents: 0, + admissionDeniedCircuitOpen: 5, + admissionDeniedRateLimited: 3, + deliveryAttempts: 100, + deliverySuccesses: 92, + deliveryFailures: 5, + deliveryRateLimited: 3, +}; + const scenario: Scenario = { phases: [{ durationSecs: 1, targetEps: 1000 }], eventMix: [ @@ -71,6 +105,17 @@ beforeEach(() => { jest.clearAllMocks(); mockGeneratePhaseLoad.mockResolvedValue(mockPhaseResult); mockQueryDeliveryMetricsSnapshot.mockResolvedValue(null); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline.mockResolvedValue([]); + mockDeriveQueueUrls.mockReturnValue([ + "https://sqs.example.invalid/inbound-event-queue", + ]); + mockPurgeQueues.mockResolvedValue(undefined); + mockFlushElastiCache.mockResolvedValue(undefined); + mockVerifyMockWebhook.mockResolvedValue({ + receivedCallbacks: 0, + verified: false, + }); immediateSleep.mockResolvedValue(undefined); }); @@ -78,6 +123,7 @@ describe("runPerformanceTest", () => { it("returns a PerformanceResult with phase results and snapshots from polling and final query", async () => { mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + 
mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); const result = await runPerformanceTest( deps, @@ -92,6 +138,7 @@ describe("runPerformanceTest", () => { expect(result.phases[0]).toEqual(mockPhaseResult); expect(result.metrics).toHaveLength(2); // one mid-test, one final expect(result.deliveryMetrics).toHaveLength(2); // one mid-test, one final + expect(result.circuitBreakerMetrics).toHaveLength(2); // one mid-test, one final expect(result.startedAt).toBeTruthy(); expect(result.completedAt).toBeTruthy(); }); @@ -111,6 +158,7 @@ describe("runPerformanceTest", () => { expect(result.metrics).toHaveLength(1); expect(result.metrics[0]).toEqual(mockSnapshot); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("produces an empty metrics array when all queries return null", async () => { @@ -125,6 +173,7 @@ describe("runPerformanceTest", () => { expect(result.metrics).toHaveLength(0); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("runs all phases and collects each result", async () => { @@ -267,7 +316,9 @@ describe("runPerformanceTest", () => { ); expect(mockQueryDeliveryMetricsSnapshot).not.toHaveBeenCalled(); + expect(mockQueryCircuitBreakerSnapshot).not.toHaveBeenCalled(); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("builds delivery log group names from prefix and event mix client IDs", async () => { @@ -309,6 +360,273 @@ describe("runPerformanceTest", () => { expect.any(Number), ); }); + + it("collects circuit breaker metrics when deliveryLogGroupPrefix is set", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); + + const result = await runPerformanceTest( + deps, + scenario, + "test-cb-1", + 
immediateSleep, + ); + + expect(result.circuitBreakerMetrics.length).toBeGreaterThanOrEqual(1); + expect(mockQueryCircuitBreakerSnapshot).toHaveBeenCalled(); + }); + + it("returns empty circuitBreakerMetrics when CB queries return null", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(null); + + const result = await runPerformanceTest( + deps, + scenario, + "test-cb-null", + immediateSleep, + ); + + expect(result.circuitBreakerMetrics).toHaveLength(0); + }); + + it("uses per-interval windowing for circuit breaker snapshots", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); + + let resolvePhase!: (value: PhaseResult) => void; + mockGeneratePhaseLoad.mockImplementation( + () => + new Promise((r) => { + resolvePhase = r; + }), + ); + + let sleepCount = 0; + const controlledSleep = jest.fn(async () => { + sleepCount += 1; + if (sleepCount >= 3) { + resolvePhase(mockPhaseResult); + } + }); + + await runPerformanceTest( + deps, + scenario, + "test-cb-interval", + controlledSleep, + ); + + const cbCalls = mockQueryCircuitBreakerSnapshot.mock.calls; + expect(cbCalls.length).toBeGreaterThanOrEqual(2); + const firstCallEndSec = cbCalls[0][3]; + const secondCallStartSec = cbCalls[1][2]; + expect(secondCallStartSec).toBe(firstCallEndSec); + }); + + it("collects per-client rate timelines when deliveryLogGroupPrefix is set", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryPerClientRateTimeline.mockResolvedValue([ + { timestampSec: 1000, deliveryAttempts: 10 }, + ]); + + const result = await runPerformanceTest( + deps, + scenario, + "test-pcr-1", + 
immediateSleep, + ); + + expect(result.perClientRateTimelines).toHaveLength(1); + expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1"); + expect(result.perClientRateTimelines![0].entries).toHaveLength(1); + }); + + it("queries each client log group individually for rate timelines", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline.mockResolvedValue([ + { timestampSec: 1000, deliveryAttempts: 5 }, + ]); + + const multiClientScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + }; + + const result = await runPerformanceTest( + deps, + multiClientScenario, + "test-pcr-multi", + immediateSleep, + ); + + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledTimes(2); + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( + deps.cloudWatchClient, + "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-1", + expect.any(Number), + expect.any(Number), + ); + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( + deps.cloudWatchClient, + "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-2", + expect.any(Number), + expect.any(Number), + ); + expect(result.perClientRateTimelines).toHaveLength(2); + }); + + it("excludes clients with empty rate timelines", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline + .mockResolvedValueOnce([{ timestampSec: 1000, deliveryAttempts: 5 }]) + .mockResolvedValueOnce([]); + + const multiClientScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + }; + + const result = 
await runPerformanceTest( + deps, + multiClientScenario, + "test-pcr-filter", + immediateSleep, + ); + + expect(result.perClientRateTimelines).toHaveLength(1); + expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1"); + }); + + it("skips per-client rate timelines when deliveryLogGroupPrefix is undefined", async () => { + const depsWithoutPrefix: RunnerDeps = { + ...deps, + deliveryLogGroupPrefix: undefined, + }; + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + + const result = await runPerformanceTest( + depsWithoutPrefix, + scenario, + "test-pcr-skip", + immediateSleep, + ); + + expect(mockQueryPerClientRateTimeline).not.toHaveBeenCalled(); + expect(result.perClientRateTimelines).toHaveLength(0); + }); + + it("purges queues before and after the test run", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + await runPerformanceTest(deps, scenario, "test-purge", immediateSleep); + + expect(mockDeriveQueueUrls).toHaveBeenCalledWith(deps.queueUrl, scenario); + expect(mockPurgeQueues).toHaveBeenCalledTimes(2); + }); + + it("flushes ElastiCache before and after when deps are provided", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + const elastiCacheDeps = { + endpoint: "cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", + }; + + await runPerformanceTest( + deps, + scenario, + "test-flush", + immediateSleep, + elastiCacheDeps, + ); + + expect(mockFlushElastiCache).toHaveBeenCalledTimes(2); + expect(mockFlushElastiCache).toHaveBeenCalledWith(elastiCacheDeps); + }); + + it("skips ElastiCache flush when deps are not provided", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + await runPerformanceTest(deps, scenario, "test-no-flush", immediateSleep); + + expect(mockFlushElastiCache).not.toHaveBeenCalled(); + }); + + it("verifies mock webhook when log group is configured", async () => { + const depsWithWebhook: RunnerDeps = { + ...deps, 
+ mockWebhookLogGroup: "/aws/lambda/test-mock-webhook", + }; + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockVerifyMockWebhook.mockResolvedValue({ + receivedCallbacks: 25, + verified: true, + }); + + const result = await runPerformanceTest( + depsWithWebhook, + scenario, + "test-webhook", + immediateSleep, + ); + + expect(mockVerifyMockWebhook).toHaveBeenCalledWith( + depsWithWebhook.cloudWatchClient, + "/aws/lambda/test-mock-webhook", + expect.any(Number), + expect.any(Number), + ); + expect(result.webhookVerification).toEqual({ + receivedCallbacks: 25, + verified: true, + }); + }); + + it("omits webhook verification when log group is not configured", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + const result = await runPerformanceTest( + deps, + scenario, + "test-no-webhook", + immediateSleep, + ); + + expect(mockVerifyMockWebhook).not.toHaveBeenCalled(); + expect(result.webhookVerification).toBeUndefined(); + }); }); describe("defaultSleep", () => { diff --git a/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts new file mode 100644 index 00000000..72c49870 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts @@ -0,0 +1,173 @@ +import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; +import { verifyMockWebhook } from "webhook-verify"; + +const mockSend = jest.fn(); +const mockClient = { send: mockSend } as unknown as CloudWatchLogsClient; + +beforeEach(() => { + jest.clearAllMocks(); +}); + +describe("verifyMockWebhook", () => { + it("returns verified=true when callbacks are found", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-1" }).mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "42" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ 
receivedCallbacks: 42, verified: true }); + }); + + it("returns verified=false when no callbacks are found", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-2" }).mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "0" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when query fails", async () => { + mockSend + .mockResolvedValueOnce({ queryId: "q-3" }) + .mockResolvedValueOnce({ status: "Failed" }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when no queryId is returned", async () => { + mockSend.mockResolvedValueOnce({}); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when results are empty", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-4" }).mockResolvedValueOnce({ + status: "Complete", + results: [], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when results field is undefined", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-4b" }).mockResolvedValueOnce({ + status: "Complete", + results: undefined, + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("polls until the query completes", async () => { + mockSend + 
.mockResolvedValueOnce({ queryId: "q-5" }) + .mockResolvedValueOnce({ status: "Running" }) + .mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "10" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 10, verified: true }); + expect(mockSend).toHaveBeenCalledTimes(3); + }); + + it("returns verified=false when query is cancelled", async () => { + mockSend + .mockResolvedValueOnce({ queryId: "q-6" }) + .mockResolvedValueOnce({ status: "Cancelled" }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when polling times out", async () => { + jest.useFakeTimers(); + + mockSend.mockResolvedValueOnce({ queryId: "q-7" }).mockImplementation( + () => + new Promise((resolve) => { + setTimeout(() => resolve({ status: "Running" }), 1000); + }), + ); + + const originalDateNow = Date.now; + let callCount = 0; + jest.spyOn(Date, "now").mockImplementation(() => { + callCount += 1; + if (callCount <= 1) return originalDateNow.call(Date); + return originalDateNow.call(Date) + 60_000; + }); + + const promise = verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + await jest.advanceTimersByTimeAsync(60_000); + + const result = await promise; + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + + jest.useRealTimers(); + jest.restoreAllMocks(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/cloudwatch.ts b/lambdas/perf-runner-lambda/src/cloudwatch.ts index 206bec33..598f5f3f 100644 --- a/lambdas/perf-runner-lambda/src/cloudwatch.ts +++ b/lambdas/perf-runner-lambda/src/cloudwatch.ts @@ -3,19 +3,22 @@ import { GetQueryResultsCommand, StartQueryCommand, } from "@aws-sdk/client-cloudwatch-logs"; 
-import type { DeliveryMetricsSnapshot, MetricsSnapshot } from "types"; +import type { + CircuitBreakerSnapshot, + DeliveryMetricsSnapshot, + MetricsSnapshot, + PerClientRateEntry, +} from "types"; const INSIGHTS_POLL_INTERVAL_MS = 2000; const INSIGHTS_TIMEOUT_MS = 30_000; type ResultField = { field?: string; value?: string }; -async function pollQueryResults( +async function pollInsightsQuery( client: CloudWatchLogsClient, queryId: string, - mapRow: (row: ResultField[]) => T, -): Promise { - const zeroResult = mapRow([]); +): Promise { const deadline = Date.now() + INSIGHTS_TIMEOUT_MS; while (Date.now() < deadline) { @@ -30,15 +33,33 @@ async function pollQueryResults( } if (response.status === "Complete") { - const row = response.results?.[0]; - if (!row) return zeroResult; - return mapRow(row); + return (response.results as ResultField[][]) ?? []; } } return null; } +async function pollQueryResults( + client: CloudWatchLogsClient, + queryId: string, + mapRow: (row: ResultField[]) => T, +): Promise { + const rows = await pollInsightsQuery(client, queryId); + if (rows === null) return null; + return mapRow(rows[0] ?? 
[]); +} + +async function pollAllQueryResults( + client: CloudWatchLogsClient, + queryId: string, + mapRow: (row: ResultField[]) => T, +): Promise { + const rows = await pollInsightsQuery(client, queryId); + if (rows === null) return []; + return rows.map((row) => mapRow(row)); +} + export async function queryMetricsSnapshot( client: CloudWatchLogsClient, logGroupName: string, @@ -108,3 +129,86 @@ export async function queryDeliveryMetricsSnapshot( }; }); } + +export async function queryCircuitBreakerSnapshot( + client: CloudWatchLogsClient, + logGroupNames: string[], + startTimeSec: number, + endTimeSec: number, +): Promise { + if (logGroupNames.length === 0) return null; + + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupNames, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg in ["Circuit breaker opened", "Circuit breaker closed", "Admission denied", "Attempting delivery", "Delivery succeeded", "Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ", "Rate limited (429)"]', + '| stats sum(msg = "Circuit breaker opened") as circuitOpenEvents,', + ' sum(msg = "Circuit breaker closed") as circuitCloseEvents,', + ' sum(msg = "Admission denied" and reason = "circuit_open") as admissionDeniedCircuitOpen,', + ' sum(msg = "Admission denied" and reason = "rate_limited") as admissionDeniedRateLimited,', + ' sum(msg = "Attempting delivery") as deliveryAttempts,', + ' sum(msg = "Delivery succeeded") as deliverySuccesses,', + ' sum(msg in ["Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ"]) as deliveryFailures,', + ' sum(msg = "Rate limited (429)") as deliveryRateLimited', + ].join("\n"), + }), + ); + + if (!queryId) return null; + + return pollQueryResults(client, queryId, (row) => { + const getField = (name: string): number => + Number(row.find((f) => f.field === name)?.value ?? 
0); + + return { + snapshotAt: Date.now(), + intervalStartSec: startTimeSec, + intervalEndSec: endTimeSec, + circuitOpenEvents: getField("circuitOpenEvents"), + circuitCloseEvents: getField("circuitCloseEvents"), + admissionDeniedCircuitOpen: getField("admissionDeniedCircuitOpen"), + admissionDeniedRateLimited: getField("admissionDeniedRateLimited"), + deliveryAttempts: getField("deliveryAttempts"), + deliverySuccesses: getField("deliverySuccesses"), + deliveryFailures: getField("deliveryFailures"), + deliveryRateLimited: getField("deliveryRateLimited"), + }; + }); +} + +const RATE_TIMELINE_BIN_SECONDS = 10; + +export async function queryPerClientRateTimeline( + client: CloudWatchLogsClient, + logGroupName: string, + startTimeSec: number, + endTimeSec: number, +): Promise { + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupName, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg in ["Attempting delivery", "Admission denied"]', + `| stats sum(msg = "Attempting delivery") as deliveryAttempts by bin(@timestamp, ${RATE_TIMELINE_BIN_SECONDS}s) as timeBin`, + "| sort timeBin asc", + ].join("\n"), + }), + ); + + if (!queryId) return []; + + return pollAllQueryResults(client, queryId, (row) => { + const timeBinStr = row.find((f) => f.field === "timeBin")?.value ?? "0"; + const timestampSec = Math.floor(new Date(timeBinStr).getTime() / 1000); + const deliveryAttempts = Number( + row.find((f) => f.field === "deliveryAttempts")?.value ?? 
0, + ); + + return { timestampSec, deliveryAttempts }; + }); +} diff --git a/lambdas/perf-runner-lambda/src/elasticache.ts b/lambdas/perf-runner-lambda/src/elasticache.ts new file mode 100644 index 00000000..8d0b86c6 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/elasticache.ts @@ -0,0 +1,52 @@ +import { type RedisClientType, createClient } from "@redis/client"; +import { SignatureV4 } from "@smithy/signature-v4"; +import { Sha256 } from "@aws-crypto/sha256-js"; +import { fromNodeProviderChain } from "@aws-sdk/credential-providers"; +import type { ElastiCacheDeps } from "types"; + +const TOKEN_EXPIRY_SECONDS = 900; + +async function generateIamToken(deps: ElastiCacheDeps): Promise { + const signer = new SignatureV4({ + credentials: fromNodeProviderChain(), + region: deps.region, + service: "elasticache", + sha256: Sha256, + }); + + const signed = await signer.presign( + { + protocol: "https:", + method: "GET", + hostname: deps.cacheName, + path: "/", + query: { Action: "connect", User: deps.iamUsername }, + headers: { host: deps.cacheName }, + }, + { expiresIn: TOKEN_EXPIRY_SECONDS }, + ); + + const qs = new URLSearchParams( + signed.query as Record, + ).toString(); + return `${deps.cacheName}/?${qs}`; +} + +export async function flushElastiCache(deps: ElastiCacheDeps): Promise { + const token = await generateIamToken(deps); + + const client: RedisClientType = createClient({ + url: `rediss://${deps.endpoint}:6379`, + username: deps.iamUsername, + password: token, + }); + + try { + await client.connect(); + await client.flushAll(); + } finally { + if (client.isOpen) { + await client.disconnect(); + } + } +} diff --git a/lambdas/perf-runner-lambda/src/index.ts b/lambdas/perf-runner-lambda/src/index.ts index a0881866..5974627b 100644 --- a/lambdas/perf-runner-lambda/src/index.ts +++ b/lambdas/perf-runner-lambda/src/index.ts @@ -3,7 +3,11 @@ import { SQSClient } from "@aws-sdk/client-sqs"; import { Logger } from "@nhs-notify-client-callbacks/logger"; import { 
runPerformanceTest } from "runner"; import { DEFAULT_SCENARIO } from "scenario"; -import type { PerfRunnerPayload, PerformanceResult } from "types"; +import type { + ElastiCacheDeps, + PerfRunnerPayload, + PerformanceResult, +} from "types"; const logger = new Logger(); @@ -16,6 +20,10 @@ export async function handler( const queueUrl = process.env.INBOUND_QUEUE_URL; const logGroupName = process.env.TRANSFORM_FILTER_LOG_GROUP; const deliveryLogGroupPrefix = process.env.DELIVERY_LOG_GROUP_PREFIX; + const mockWebhookLogGroup = process.env.MOCK_WEBHOOK_LOG_GROUP; + const elasticacheEndpoint = process.env.ELASTICACHE_ENDPOINT; + const elasticacheCacheName = process.env.ELASTICACHE_CACHE_NAME; + const elasticacheIamUsername = process.env.ELASTICACHE_IAM_USERNAME; if (!queueUrl) { throw new Error("Missing required environment variable: INBOUND_QUEUE_URL"); @@ -30,6 +38,16 @@ export async function handler( const sqsClient = new SQSClient({ region }); const cloudWatchClient = new CloudWatchLogsClient({ region }); + const elastiCacheDeps: ElastiCacheDeps | undefined = + elasticacheEndpoint && elasticacheCacheName && elasticacheIamUsername + ? 
{ + endpoint: elasticacheEndpoint, + cacheName: elasticacheCacheName, + iamUsername: elasticacheIamUsername, + region, + } + : undefined; + logger.info("Performance test started", { testId }); try { @@ -40,9 +58,12 @@ export async function handler( queueUrl, logGroupName, deliveryLogGroupPrefix, + mockWebhookLogGroup, }, scenario, testId, + undefined, + elastiCacheDeps, ); logger.info("Performance test completed", { testId }); diff --git a/lambdas/perf-runner-lambda/src/purge.ts b/lambdas/perf-runner-lambda/src/purge.ts new file mode 100644 index 00000000..5743e9d2 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/purge.ts @@ -0,0 +1,40 @@ +import { PurgeQueueCommand, type SQSClient } from "@aws-sdk/client-sqs"; +import type { Scenario } from "types"; + +export function deriveQueueUrls( + inboundQueueUrl: string, + scenario: Scenario, +): string[] { + // eslint-disable-next-line sonarjs/null-dereference -- String.replace always returns a string + const baseUrl = inboundQueueUrl.replace(/inbound-event-queue$/, ""); + const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))]; + + return [ + inboundQueueUrl, + `${baseUrl}inbound-event-dlq-queue`, + ...clientIds.flatMap((id) => [ + `${baseUrl}${id}-delivery-queue`, + `${baseUrl}${id}-delivery-dlq-queue`, + ]), + ]; +} + +export async function purgeQueues( + client: SQSClient, + queueUrls: string[], +): Promise { + const results = await Promise.allSettled( + queueUrls.map((url) => + client.send(new PurgeQueueCommand({ QueueUrl: url })), + ), + ); + + for (const result of results) { + if (result.status === "rejected") { + const error = result.reason as { name?: string }; + if (error.name !== "AWS.SimpleQueueService.NonExistentQueue") { + throw result.reason as Error; + } + } + } +} diff --git a/lambdas/perf-runner-lambda/src/runner.ts b/lambdas/perf-runner-lambda/src/runner.ts index a265e90e..321abc45 100644 --- a/lambdas/perf-runner-lambda/src/runner.ts +++ b/lambdas/perf-runner-lambda/src/runner.ts @@ 
-1,13 +1,25 @@ import type { + CircuitBreakerSnapshot, DeliveryMetricsSnapshot, + ElastiCacheDeps, MetricsSnapshot, + PerClientRateTimeline, PerformanceResult, PhaseResult, RunnerDeps, Scenario, + WebhookVerificationResult, } from "types"; import { generatePhaseLoad } from "sqs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import { flushElastiCache } from "elasticache"; +import { verifyMockWebhook } from "webhook-verify"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; const CLOUDWATCH_SETTLING_MS = 60_000; @@ -25,11 +37,56 @@ function buildDeliveryLogGroupNames( return [...clientIds].map((id) => `${prefix}${id}`); } +async function collectSnapshots( + deps: RunnerDeps, + deliveryLogGroupNames: string[], + startSec: number, + endSec: number, + cbStartSec: number, + out: { + snapshots: MetricsSnapshot[]; + deliverySnapshots: DeliveryMetricsSnapshot[]; + cbSnapshots: CircuitBreakerSnapshot[]; + }, +): Promise { + const snap = await queryMetricsSnapshot( + deps.cloudWatchClient, + deps.logGroupName, + startSec, + endSec, + ); + if (snap !== null) out.snapshots.push(snap); + + if (deliveryLogGroupNames.length > 0) { + const deliverySnap = await queryDeliveryMetricsSnapshot( + deps.cloudWatchClient, + deliveryLogGroupNames, + startSec, + endSec, + ); + if (deliverySnap !== null) out.deliverySnapshots.push(deliverySnap); + + const cbSnap = await queryCircuitBreakerSnapshot( + deps.cloudWatchClient, + deliveryLogGroupNames, + cbStartSec, + endSec, + ); + if (cbSnap !== null) { + out.cbSnapshots.push(cbSnap); + return endSec; + } + } + + return cbStartSec; +} + export async function runPerformanceTest( deps: RunnerDeps, scenario: Scenario, testId: string, sleepFn: (ms: number) => Promise = defaultSleep, + elastiCacheDeps?: ElastiCacheDeps, ): Promise { if (scenario.eventMix.length 
=== 0) { throw new Error("scenario.eventMix must contain at least one entry"); @@ -49,10 +106,19 @@ export async function runPerformanceTest( } const testStartMs = Date.now(); + + const queueUrls = deriveQueueUrls(deps.queueUrl, scenario); + await purgeQueues(deps.sqsClient, queueUrls); + if (elastiCacheDeps) { + await flushElastiCache(elastiCacheDeps); + } + const startedAt = new Date(testStartMs).toISOString(); const phaseResults: PhaseResult[] = []; const snapshots: MetricsSnapshot[] = []; const deliverySnapshots: DeliveryMetricsSnapshot[] = []; + const cbSnapshots: CircuitBreakerSnapshot[] = []; + let lastCbSnapshotSec = Math.floor(testStartMs / 1000); let stopPolling = false; const deliveryLogGroupNames = buildDeliveryLogGroupNames( @@ -60,29 +126,22 @@ export async function runPerformanceTest( scenario, ); + const out = { snapshots, deliverySnapshots, cbSnapshots }; + const pollLoop = async (): Promise => { await sleepFn(scenario.metricsIntervalSecs * 1000); while (!stopPolling) { const startSec = Math.floor(testStartMs / 1000); const endSec = Math.floor(Date.now() / 1000); - const snap = await queryMetricsSnapshot( - deps.cloudWatchClient, - deps.logGroupName, + lastCbSnapshotSec = await collectSnapshots( + deps, + deliveryLogGroupNames, startSec, endSec, + lastCbSnapshotSec, + out, ); - if (snap !== null) snapshots.push(snap); - - if (deliveryLogGroupNames.length > 0) { - const deliverySnap = await queryDeliveryMetricsSnapshot( - deps.cloudWatchClient, - deliveryLogGroupNames, - startSec, - endSec, - ); - if (deliverySnap !== null) deliverySnapshots.push(deliverySnap); - } if (!stopPolling) { await sleepFn(scenario.metricsIntervalSecs * 1000); @@ -110,22 +169,48 @@ export async function runPerformanceTest( const finalStartSec = Math.floor(testStartMs / 1000); const finalEndSec = Math.floor(Date.now() / 1000); - const finalSnap = await queryMetricsSnapshot( - deps.cloudWatchClient, - deps.logGroupName, + await collectSnapshots( + deps, + 
deliveryLogGroupNames, finalStartSec, finalEndSec, + lastCbSnapshotSec, + out, ); - if (finalSnap !== null) snapshots.push(finalSnap); - if (deliveryLogGroupNames.length > 0) { - const finalDeliverySnap = await queryDeliveryMetricsSnapshot( + const perClientRateTimelines: PerClientRateTimeline[] = []; + + if (deps.deliveryLogGroupPrefix) { + const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))]; + const timelinePromises = clientIds.map(async (clientId) => { + const logGroupName = `${deps.deliveryLogGroupPrefix}${clientId}`; + const entries = await queryPerClientRateTimeline( + deps.cloudWatchClient, + logGroupName, + finalStartSec, + finalEndSec, + ); + return { clientId, entries }; + }); + const timelines = await Promise.all(timelinePromises); + perClientRateTimelines.push( + ...timelines.filter((t) => t.entries.length > 0), + ); + } + + let webhookVerification: WebhookVerificationResult | undefined; + if (deps.mockWebhookLogGroup) { + webhookVerification = await verifyMockWebhook( deps.cloudWatchClient, - deliveryLogGroupNames, + deps.mockWebhookLogGroup, finalStartSec, finalEndSec, ); - if (finalDeliverySnap !== null) deliverySnapshots.push(finalDeliverySnap); + } + + await purgeQueues(deps.sqsClient, queueUrls); + if (elastiCacheDeps) { + await flushElastiCache(elastiCacheDeps); } return { @@ -136,5 +221,8 @@ export async function runPerformanceTest( phases: phaseResults, metrics: snapshots, deliveryMetrics: deliverySnapshots, + circuitBreakerMetrics: cbSnapshots, + perClientRateTimelines, + webhookVerification, }; } diff --git a/lambdas/perf-runner-lambda/src/types.ts b/lambdas/perf-runner-lambda/src/types.ts index 5366602d..24df2a50 100644 --- a/lambdas/perf-runner-lambda/src/types.ts +++ b/lambdas/perf-runner-lambda/src/types.ts @@ -55,6 +55,35 @@ export type DeliveryMetricsSnapshot = { p99Ms: number; }; +export type CircuitBreakerSnapshot = { + snapshotAt: number; + intervalStartSec: number; + intervalEndSec: number; + circuitOpenEvents: 
number; + circuitCloseEvents: number; + admissionDeniedCircuitOpen: number; + admissionDeniedRateLimited: number; + deliveryAttempts: number; + deliverySuccesses: number; + deliveryFailures: number; + deliveryRateLimited: number; +}; + +export type PerClientRateEntry = { + timestampSec: number; + deliveryAttempts: number; +}; + +export type PerClientRateTimeline = { + clientId: string; + entries: PerClientRateEntry[]; +}; + +export type WebhookVerificationResult = { + receivedCallbacks: number; + verified: boolean; +}; + export type PerformanceResult = { testId: string; scenario: Scenario; @@ -63,6 +92,9 @@ export type PerformanceResult = { phases: PhaseResult[]; metrics: MetricsSnapshot[]; deliveryMetrics: DeliveryMetricsSnapshot[]; + circuitBreakerMetrics: CircuitBreakerSnapshot[]; + perClientRateTimelines?: PerClientRateTimeline[]; + webhookVerification?: WebhookVerificationResult; }; export type PerfRunnerPayload = { @@ -76,4 +108,12 @@ export type RunnerDeps = { queueUrl: string; logGroupName: string; deliveryLogGroupPrefix?: string; + mockWebhookLogGroup?: string; +}; + +export type ElastiCacheDeps = { + endpoint: string; + cacheName: string; + iamUsername: string; + region: string; }; diff --git a/lambdas/perf-runner-lambda/src/webhook-verify.ts b/lambdas/perf-runner-lambda/src/webhook-verify.ts new file mode 100644 index 00000000..77c1fa6d --- /dev/null +++ b/lambdas/perf-runner-lambda/src/webhook-verify.ts @@ -0,0 +1,59 @@ +import { + type CloudWatchLogsClient, + GetQueryResultsCommand, + StartQueryCommand, +} from "@aws-sdk/client-cloudwatch-logs"; +import type { WebhookVerificationResult } from "types"; + +const INSIGHTS_POLL_INTERVAL_MS = 2000; +const INSIGHTS_TIMEOUT_MS = 30_000; + +export async function verifyMockWebhook( + client: CloudWatchLogsClient, + logGroupName: string, + startTimeSec: number, + endTimeSec: number, +): Promise { + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupName, + startTime: startTimeSec, + 
endTime: endTimeSec, + queryString: [ + 'filter msg = "Callback received"', + "| stats count(*) as callbackCount", + ].join("\n"), + }), + ); + + if (!queryId) { + return { receivedCallbacks: 0, verified: false }; + } + + const deadline = Date.now() + INSIGHTS_TIMEOUT_MS; + + while (Date.now() < deadline) { + await new Promise((resolve) => { + setTimeout(resolve, INSIGHTS_POLL_INTERVAL_MS); + }); + + const response = await client.send(new GetQueryResultsCommand({ queryId })); + + if (response.status === "Failed" || response.status === "Cancelled") { + return { receivedCallbacks: 0, verified: false }; + } + + if (response.status === "Complete") { + const rows = + (response.results as { field?: string; value?: string }[][]) ?? []; + const row = rows[0] ?? []; + const count = Number( + row.find((f) => f.field === "callbackCount")?.value ?? 0, + ); + + return { receivedCallbacks: count, verified: count > 0 }; + } + } + + return { receivedCallbacks: 0, verified: false }; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c497eafb..f2b2aa3a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -450,21 +450,30 @@ importers: lambdas/perf-runner-lambda: dependencies: + '@aws-crypto/sha256-js': + specifier: catalog:aws + version: 5.2.0 '@aws-sdk/client-cloudwatch-logs': specifier: catalog:aws version: 3.1026.0 '@aws-sdk/client-sqs': specifier: catalog:aws version: 3.1026.0 + '@aws-sdk/credential-providers': + specifier: catalog:aws + version: 3.1026.0 '@nhs-notify-client-callbacks/logger': specifier: workspace:* version: link:../../src/logger '@nhs-notify-client-callbacks/models': specifier: workspace:* version: link:../../src/models - esbuild: - specifier: catalog:tools - version: 0.28.0 + '@redis/client': + specifier: catalog:app + version: 1.6.1 + '@smithy/signature-v4': + specifier: catalog:aws + version: 5.3.13 devDependencies: '@tsconfig/node22': specifier: catalog:tools @@ -478,6 +487,9 @@ importers: '@types/node': specifier: catalog:tools version: 25.6.0 + esbuild: 
+ specifier: catalog:tools + version: 0.28.0 eslint: specifier: catalog:lint version: 9.39.4(jiti@2.6.1) From 07410f72626213d7ba3b1936ae92e904c1ad5702 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Mon, 27 Apr 2026 10:02:29 +0100 Subject: [PATCH 2/9] CCM-16073 - Fixed perf runner permissions --- .../callbacks/module_perf_runner_lambda.tf | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf index f3f57981..7a77c40c 100644 --- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf +++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf @@ -143,4 +143,19 @@ data "aws_iam_policy_document" "perf_runner_lambda" { aws_elasticache_user.delivery_state_iam.arn, ] } + + statement { + sid = "VPCNetworkInterfacePermissions" + effect = "Allow" + + actions = [ + "ec2:CreateNetworkInterface", + "ec2:DeleteNetworkInterface", + "ec2:DescribeNetworkInterfaces", + ] + + resources = [ + "*", + ] + } } From 6673cca4d97f6fa3844d2e5e70c6ef7ab0e01619 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Tue, 28 Apr 2026 09:53:51 +0100 Subject: [PATCH 3/9] CCM-16073 - Updated rate limiting behaviour --- .../terraform/components/callbacks/README.md | 1 + .../callbacks/module_client_delivery.tf | 2 ++ .../components/callbacks/variables.tf | 6 ++++ .../modules/client-delivery/README.md | 1 + .../module_https_client_lambda.tf | 1 + .../modules/client-delivery/variables.tf | 6 ++++ .../src/__tests__/endpoint-gate.test.ts | 28 ++++++++++++++----- .../src/__tests__/handler.test.ts | 27 ++++++++++++++++-- .../src/__tests__/record-result-lua.test.ts | 6 ++-- .../src/services/endpoint-gate.ts | 4 +-- .../src/services/record-result.lua | 3 +- 11 files changed, 69 insertions(+), 16 deletions(-) diff --git a/infrastructure/terraform/components/callbacks/README.md 
b/infrastructure/terraform/components/callbacks/README.md index 02804698..e090abb9 100644 --- a/infrastructure/terraform/components/callbacks/README.md +++ b/infrastructure/terraform/components/callbacks/README.md @@ -45,6 +45,7 @@ | [s3\_enable\_force\_destroy](#input\_s3\_enable\_force\_destroy) | Whether to enable force destroy for the S3 buckets created in this module | `bool` | `false` | no | | [sqs\_inbound\_event\_max\_receive\_count](#input\_sqs\_inbound\_event\_max\_receive\_count) | n/a | `number` | `3` | no | | [sqs\_inbound\_event\_visibility\_timeout\_seconds](#input\_sqs\_inbound\_event\_visibility\_timeout\_seconds) | n/a | `number` | `60` | no | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | ## Modules | Name | Source | Version | diff --git a/infrastructure/terraform/components/callbacks/module_client_delivery.tf b/infrastructure/terraform/components/callbacks/module_client_delivery.tf index ebc2e9e1..5122606e 100644 --- a/infrastructure/terraform/components/callbacks/module_client_delivery.tf +++ b/infrastructure/terraform/components/callbacks/module_client_delivery.tf @@ -41,6 +41,8 @@ module "client_delivery" { mtls_test_cert_s3_key = local.mtls_test_cert_s3_key # gitleaks:allow mtls_test_ca_s3_key = local.mtls_test_ca_s3_key # gitleaks:allow + token_bucket_burst_capacity = var.token_bucket_burst_capacity + vpc_subnet_ids = try(local.acct.private_subnets[local.bc_name], []) lambda_security_group_id = aws_security_group.https_client_lambda.id } diff --git a/infrastructure/terraform/components/callbacks/variables.tf b/infrastructure/terraform/components/callbacks/variables.tf index 9c71492d..aef32373 100644 --- a/infrastructure/terraform/components/callbacks/variables.tf +++ b/infrastructure/terraform/components/callbacks/variables.tf @@ -195,3 +195,9 @@ variable "elasticache_data_storage_maximum_gb" { description = "Maximum data storage 
in GB for the ElastiCache Serverless delivery state cache" default = 1 } + +variable "token_bucket_burst_capacity" { + type = number + description = "Token bucket burst capacity used by the rate limiter" + default = 2250 +} diff --git a/infrastructure/terraform/modules/client-delivery/README.md b/infrastructure/terraform/modules/client-delivery/README.md index 0a4965e7..2036c60d 100644 --- a/infrastructure/terraform/modules/client-delivery/README.md +++ b/infrastructure/terraform/modules/client-delivery/README.md @@ -45,6 +45,7 @@ No requirements. | [sqs\_visibility\_timeout\_seconds](#input\_sqs\_visibility\_timeout\_seconds) | Visibility timeout for the per-client delivery queue | `number` | `60` | no | | [subscription\_targets](#input\_subscription\_targets) | Flattened subscription-target fanout map keyed by subscription-target composite key |
map(object({
subscription_id = string
target_id = string
}))
| n/a | yes | | [subscriptions](#input\_subscriptions) | Subscription definitions for this client, keyed by subscription\_id |
map(object({
subscription_id = string
target_ids = list(string)
}))
| n/a | yes | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | | [vpc\_subnet\_ids](#input\_vpc\_subnet\_ids) | VPC subnet IDs for Lambda execution | `list(string)` | `[]` | no | ## Modules diff --git a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf index 1260d471..0021fb80 100644 --- a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf +++ b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf @@ -53,6 +53,7 @@ module "https_client_lambda" { MTLS_TEST_CERT_S3_BUCKET = var.mtls_test_cert_s3_bucket MTLS_TEST_CERT_S3_KEY = var.mtls_test_cert_s3_key # gitleaks:allow QUEUE_URL = module.sqs_delivery.sqs_queue_url + TOKEN_BUCKET_BURST_CAPACITY = tostring(var.token_bucket_burst_capacity) } vpc_config = var.lambda_security_group_id != "" ? 
{ diff --git a/infrastructure/terraform/modules/client-delivery/variables.tf b/infrastructure/terraform/modules/client-delivery/variables.tf index 643e163e..801ca291 100644 --- a/infrastructure/terraform/modules/client-delivery/variables.tf +++ b/infrastructure/terraform/modules/client-delivery/variables.tf @@ -181,6 +181,12 @@ variable "mtls_test_ca_s3_key" { default = "" } +variable "token_bucket_burst_capacity" { + type = number + description = "Token bucket burst capacity used by the rate limiter" + default = 2250 +} + variable "elasticache_endpoint" { type = string description = "ElastiCache Serverless endpoint URL" diff --git a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts index 2cc8cc31..c8327c3a 100644 --- a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts @@ -197,8 +197,8 @@ describe("evalScript", () => { }); describe("recordResult", () => { - it("returns closed on success below threshold", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + it("returns ok on steady-state success below threshold", async () => { + mockSendCommand.mockResolvedValueOnce([1, "ok"]); const result = await recordResult( mockRedis, @@ -208,7 +208,7 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: true, state: "closed" }); + expect(result).toEqual({ ok: true, state: "ok" }); expect(mockSendCommand).toHaveBeenCalledWith( expect.arrayContaining(["EVALSHA"]), ); @@ -228,6 +228,20 @@ describe("recordResult", () => { expect(result).toEqual({ ok: false, state: "opened" }); }); + it("returns closed when circuit transitions from open to closed", async () => { + mockSendCommand.mockResolvedValueOnce([1, "closed"]); + + const result = await recordResult( + mockRedis, + "target-1", + 5, + 0, + defaultConfig, + ); + + expect(result).toEqual({ ok: true, state: "closed" }); 
+ }); + it("returns failed when failure is below threshold", async () => { mockSendCommand.mockResolvedValueOnce([0, "failed"]); @@ -245,7 +259,7 @@ describe("recordResult", () => { it("falls back to EVAL on NOSCRIPT error", async () => { mockSendCommand .mockRejectedValueOnce(new Error("NOSCRIPT No matching script")) - .mockResolvedValueOnce([1, "closed"]); + .mockResolvedValueOnce([1, "ok"]); const result = await recordResult( mockRedis, @@ -255,12 +269,12 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: true, state: "closed" }); + expect(result).toEqual({ ok: true, state: "ok" }); expect(mockSendCommand).toHaveBeenCalledTimes(2); }); it("passes correct ep key for target", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + mockSendCommand.mockResolvedValueOnce([1, "ok"]); await recordResult(mockRedis, "my-target", 1, 0, defaultConfig); @@ -269,7 +283,7 @@ describe("recordResult", () => { }); it("passes consumedTokens and processingFailures as ARGV", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + mockSendCommand.mockResolvedValueOnce([1, "ok"]); await recordResult(mockRedis, "target-1", 8, 3, defaultConfig); diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts index a2b7e8b4..f69d1d51 100644 --- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts @@ -116,7 +116,7 @@ describe("processRecords", () => { consumedTokens: 100, effectiveRate: 10, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + mockRecordResult.mockResolvedValue({ ok: true, state: "ok" }); }); it("returns no failures on successful delivery", async () => { @@ -515,7 +515,7 @@ describe("processRecords", () => { expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("does not record CircuitBreakerOpen when recordResult returns closed", async 
() => { + it("does not record CircuitBreakerOpen when recordResult returns ok", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -525,7 +525,7 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + mockRecordResult.mockResolvedValue({ ok: true, state: "ok" }); const { recordCircuitBreakerOpen } = jest.requireMock( "services/delivery-observability", @@ -536,6 +536,27 @@ describe("processRecords", () => { expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); + it("records CircuitBreakerClosed when recordResult returns closed", async () => { + const targetCb = { + ...DEFAULT_TARGET, + delivery: { circuitBreaker: { enabled: true } }, + }; + mockLoadTargetConfig.mockResolvedValue(targetCb); + mockDeliverPayload.mockResolvedValue({ + outcome: "success", + statusCode: 200, + }); + mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + + const { recordCircuitBreakerClosed } = jest.requireMock( + "services/delivery-observability", + ); + + await processRecords([makeRecord()]); + + expect(recordCircuitBreakerClosed).toHaveBeenCalledWith("target-1"); + }); + it("records RateLimited on 429 response", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", diff --git a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts index 5cc407fe..48495de1 100644 --- a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts @@ -3,7 +3,7 @@ import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; // ARGV: [now, consumedTokens, processingFailures, cooldownPeriodMs, recoveryPeriodMs, failureThreshold, minAttempts, samplePeriodMs] // KEYS: [epKey] -// Returns: [ok (0|1), state] state: "closed" | "opened" 
| "failed" +// Returns: [ok (0|1), state] state: "ok" | "closed" | "opened" | "failed" type RecordResultArgs = { now: number; @@ -54,7 +54,7 @@ function runRecordResult( describe("record-result.lua", () => { describe("success recording", () => { - it("returns closed state for a successful batch", () => { + it("returns ok state for a successful batch with no state change", () => { const store = createRedisStore(); store.set("ep:t1", new Map([["sample_till", "9999999999"]])); @@ -64,7 +64,7 @@ describe("record-result.lua", () => { }); expect(ok).toBe(1); - expect(state).toBe("closed"); + expect(state).toBe("ok"); }); it("increments cur_attempts without incrementing cur_failures", () => { diff --git a/lambdas/https-client-lambda/src/services/endpoint-gate.ts b/lambdas/https-client-lambda/src/services/endpoint-gate.ts index bf9c1462..8a3b9089 100644 --- a/lambdas/https-client-lambda/src/services/endpoint-gate.ts +++ b/lambdas/https-client-lambda/src/services/endpoint-gate.ts @@ -19,7 +19,7 @@ export type AdmitResultDenied = { export type AdmitResult = AdmitResultAllowed | AdmitResultDenied; export type RecordResultOutcome = - | { ok: true; state: "closed" } + | { ok: true; state: "closed" | "ok" } | { ok: false; state: "opened" | "failed" }; export type EndpointGateConfig = { @@ -159,7 +159,7 @@ export async function recordResult( const [ok, state] = raw; if (ok === 1) { - return { ok: true, state: "closed" }; + return { ok: true, state: state as "closed" | "ok" }; } return { ok: false, state: state as "opened" | "failed" }; diff --git a/lambdas/https-client-lambda/src/services/record-result.lua b/lambdas/https-client-lambda/src/services/record-result.lua index fa3b1b12..c8c6a0a6 100644 --- a/lambdas/https-client-lambda/src/services/record-result.lua +++ b/lambdas/https-client-lambda/src/services/record-result.lua @@ -14,6 +14,7 @@ local OPENED = "opened" local CLOSED = "closed" local FAILED = "failed" +local OK = "ok" -- Keys local epKey = KEYS[1] -- ep:{targetId} 
combined endpoint state hash @@ -147,4 +148,4 @@ if isOpen or processingFailures > 0 then return { 0, FAILED } end -return { 1, CLOSED } +return { 1, OK } From 9b0a5114051a1125403a8b3d5b344601a23bf2b7 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Tue, 28 Apr 2026 12:24:08 +0100 Subject: [PATCH 4/9] CCM-16073 - Updated rate limiting behaviour --- .../src/__tests__/admit-lua.test.ts | 80 ++++++++-- .../src/__tests__/endpoint-gate.test.ts | 35 +++-- .../src/__tests__/handler.test.ts | 28 +++- .../src/__tests__/record-result-lua.test.ts | 147 +++++++++++++----- lambdas/https-client-lambda/src/handler.ts | 7 +- .../src/services/admit.lua | 22 +-- .../src/services/endpoint-gate.ts | 22 +-- .../src/services/record-result.lua | 66 +++++--- 8 files changed, 290 insertions(+), 117 deletions(-) diff --git a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts index f4906cf2..43aa2fb6 100644 --- a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts @@ -62,7 +62,7 @@ function runAdmit( describe("admit.lua", () => { describe("rate limiting", () => { - it("enters recovery ramp-up on a fresh endpoint with no prior state", () => { + it("enters half-open probe on a fresh endpoint with no prior state", () => { const store = createRedisStore(); const now = 1_000_000; @@ -73,29 +73,39 @@ describe("admit.lua", () => { expect(consumedTokens).toBe(0); expect(reason).toBe("rate_limited"); - expect(effectiveRate).toBe(0); + expect(effectiveRate).toBeCloseTo(1 / 60, 5); }); - it("persists switched_at on first contact so recovery ramp progresses", () => { + it("does not persist circuit state on first contact", () => { const store = createRedisStore(); const now = 1_000_000; runAdmit(store, { now, targetRateLimit: 10 }); const epHash = store.get("ep:t1")!; - expect(epHash.get("switched_at")).toBe(now.toString()); + 
expect(epHash.has("is_open")).toBe(false); + expect(epHash.has("switched_at")).toBe(false); }); - it("ramps up rate on subsequent calls after fresh endpoint initialisation", () => { + it("allows full rate after record-result closes the circuit", () => { const store = createRedisStore(); const now = 1_000_000; - const later = now + 60_000; - runAdmit(store, { now, targetRateLimit: 10 }); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", now.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ]), + ); + const later = now + 60_000; const { consumedTokens, reason } = runAdmit(store, { now: later, targetRateLimit: 10, + recoveryPeriodMs: 600_000, }); expect(consumedTokens).toBeGreaterThanOrEqual(1); @@ -108,6 +118,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "0"], ["bucket_refilled_at", "0"], ["switched_at", "0"], @@ -130,6 +141,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "5"], ["bucket_refilled_at", now.toString()], ["switched_at", "0"], @@ -149,6 +161,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "2"], ["bucket_refilled_at", now.toString()], ["switched_at", "0"], @@ -168,6 +181,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "0"], ["bucket_refilled_at", now.toString()], ["switched_at", "0"], @@ -186,6 +200,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "0"], ["bucket_refilled_at", now.toString()], ["switched_at", "0"], @@ -205,6 +220,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "0"], ["bucket_refilled_at", "0"], ["switched_at", "0"], @@ -226,6 +242,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "0"], ["bucket_refilled_at", 
now.toString()], ["switched_at", "0"], @@ -246,6 +263,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "0"], ["bucket_refilled_at", (now - 150).toString()], ["switched_at", "0"], @@ -345,6 +363,32 @@ describe("admit.lua", () => { expect(effectiveRate).toBeCloseTo(1 / 60, 5); }); + it("zeroes residual bucket tokens when circuit is half-open", () => { + const store = createRedisStore(); + const now = 1_000_000; + const switchedAt = now - 130_000; + + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], + ["bucket_refilled_at", (now - 60_000).toString()], + ]), + ); + + const { consumedTokens } = runAdmit(store, { + now, + cooldownMs: 120_000, + probeRateLimit: 1 / 60, + }); + + expect(consumedTokens).toBe(1); + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(0); + }); + it("uses recovery ramp when closed during recovery period", () => { const store = createRedisStore(); const switchedAt = 1_000_000; @@ -366,7 +410,9 @@ describe("admit.lua", () => { targetRateLimit: 10, recoveryPeriodMs, }); - expect(effectiveRate).toBe(5); + const probeRate = defaultArgs.probeRateLimit; + const expectedRate = probeRate + 0.5 * (10 - probeRate); + expect(effectiveRate).toBeCloseTo(expectedRate, 5); }); it("uses full rate when closed and past recovery period", () => { @@ -401,6 +447,7 @@ describe("admit.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], ["bucket_tokens", "5"], ["bucket_refilled_at", now.toString()], ["switched_at", "0"], @@ -413,16 +460,29 @@ describe("admit.lua", () => { expect(Number(epHash.get("bucket_tokens"))).toBe(3); }); - it("does not write sampling or circuit fields", () => { + it("does not write any fields when circuit_open early return", () => { const store = createRedisStore(); runAdmit(store, { now: 10_000, }); + expect(store.has("ep:t1")).toBe(false); + }); + + it("does not write 
sampling or circuit fields on half-open path", () => { + const store = createRedisStore(); + runAdmit(store, { + now: 200_000, + }); + const epHash = store.get("ep:t1")!; + expect(epHash.has("bucket_tokens")).toBe(true); + expect(epHash.has("bucket_refilled_at")).toBe(true); expect(epHash.has("cur_attempts")).toBe(false); expect(epHash.has("cur_failures")).toBe(false); expect(epHash.has("sample_till")).toBe(false); + expect(epHash.has("is_open")).toBe(false); + expect(epHash.has("switched_at")).toBe(false); }); it("isolates state between targets", () => { @@ -430,6 +490,7 @@ describe("admit.lua", () => { store.set( "ep:target-a", new Map([ + ["is_open", "0"], ["bucket_tokens", "5"], ["bucket_refilled_at", "10000"], ]), @@ -437,6 +498,7 @@ describe("admit.lua", () => { store.set( "ep:target-b", new Map([ + ["is_open", "0"], ["bucket_tokens", "3"], ["bucket_refilled_at", "10000"], ]), diff --git a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts index c8327c3a..eea9d44d 100644 --- a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts @@ -197,8 +197,8 @@ describe("evalScript", () => { }); describe("recordResult", () => { - it("returns ok on steady-state success below threshold", async () => { - mockSendCommand.mockResolvedValueOnce([1, "ok"]); + it("returns closed state when circuit is steady-state", async () => { + mockSendCommand.mockResolvedValueOnce(["closed", 0]); const result = await recordResult( mockRedis, @@ -208,14 +208,14 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: true, state: "ok" }); + expect(result).toEqual({ circuitState: "closed", stateChanged: false }); expect(mockSendCommand).toHaveBeenCalledWith( expect.arrayContaining(["EVALSHA"]), ); }); - it("returns opened when failure crosses threshold", async () => { - 
mockSendCommand.mockResolvedValueOnce([0, "opened"]); + it("returns open with stateChanged when failure crosses threshold", async () => { + mockSendCommand.mockResolvedValueOnce(["open", 1]); const result = await recordResult( mockRedis, @@ -225,11 +225,11 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: false, state: "opened" }); + expect(result).toEqual({ circuitState: "open", stateChanged: true }); }); - it("returns closed when circuit transitions from open to closed", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + it("returns closed_recovery with stateChanged when circuit closes", async () => { + mockSendCommand.mockResolvedValueOnce(["closed_recovery", 1]); const result = await recordResult( mockRedis, @@ -239,11 +239,14 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: true, state: "closed" }); + expect(result).toEqual({ + circuitState: "closed_recovery", + stateChanged: true, + }); }); - it("returns failed when failure is below threshold", async () => { - mockSendCommand.mockResolvedValueOnce([0, "failed"]); + it("returns half_open without stateChanged when probing", async () => { + mockSendCommand.mockResolvedValueOnce(["half_open", 0]); const result = await recordResult( mockRedis, @@ -253,13 +256,13 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: false, state: "failed" }); + expect(result).toEqual({ circuitState: "half_open", stateChanged: false }); }); it("falls back to EVAL on NOSCRIPT error", async () => { mockSendCommand .mockRejectedValueOnce(new Error("NOSCRIPT No matching script")) - .mockResolvedValueOnce([1, "ok"]); + .mockResolvedValueOnce(["closed", 0]); const result = await recordResult( mockRedis, @@ -269,12 +272,12 @@ describe("recordResult", () => { defaultConfig, ); - expect(result).toEqual({ ok: true, state: "ok" }); + expect(result).toEqual({ circuitState: "closed", stateChanged: false }); 
expect(mockSendCommand).toHaveBeenCalledTimes(2); }); it("passes correct ep key for target", async () => { - mockSendCommand.mockResolvedValueOnce([1, "ok"]); + mockSendCommand.mockResolvedValueOnce(["closed", 0]); await recordResult(mockRedis, "my-target", 1, 0, defaultConfig); @@ -283,7 +286,7 @@ describe("recordResult", () => { }); it("passes consumedTokens and processingFailures as ARGV", async () => { - mockSendCommand.mockResolvedValueOnce([1, "ok"]); + mockSendCommand.mockResolvedValueOnce(["closed", 0]); await recordResult(mockRedis, "target-1", 8, 3, defaultConfig); diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts index f69d1d51..cfdc57e0 100644 --- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts @@ -473,7 +473,7 @@ describe("processRecords", () => { expect(mockRecordResult).not.toHaveBeenCalled(); }); - it("records CircuitBreakerOpen when recordResult returns opened", async () => { + it("records CircuitBreakerOpen when recordResult indicates circuit opened", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -483,7 +483,10 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: false, state: "opened" }); + mockRecordResult.mockResolvedValue({ + circuitState: "open", + stateChanged: true, + }); const { recordCircuitBreakerOpen } = jest.requireMock( "services/delivery-observability", @@ -494,7 +497,7 @@ describe("processRecords", () => { expect(recordCircuitBreakerOpen).toHaveBeenCalledWith("target-1"); }); - it("does not record CircuitBreakerOpen when recordResult returns failed", async () => { + it("does not record CircuitBreakerOpen when recordResult has no state change", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, 
@@ -504,7 +507,10 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: false, state: "failed" }); + mockRecordResult.mockResolvedValue({ + circuitState: "open", + stateChanged: false, + }); const { recordCircuitBreakerOpen } = jest.requireMock( "services/delivery-observability", @@ -515,7 +521,7 @@ describe("processRecords", () => { expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("does not record CircuitBreakerOpen when recordResult returns ok", async () => { + it("does not record CircuitBreakerOpen when circuit is closed", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -525,7 +531,10 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "ok" }); + mockRecordResult.mockResolvedValue({ + circuitState: "closed", + stateChanged: false, + }); const { recordCircuitBreakerOpen } = jest.requireMock( "services/delivery-observability", @@ -536,7 +545,7 @@ describe("processRecords", () => { expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("records CircuitBreakerClosed when recordResult returns closed", async () => { + it("records CircuitBreakerClosed when recordResult indicates circuit closed", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -546,7 +555,10 @@ describe("processRecords", () => { outcome: "success", statusCode: 200, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + mockRecordResult.mockResolvedValue({ + circuitState: "closed_recovery", + stateChanged: true, + }); const { recordCircuitBreakerClosed } = jest.requireMock( "services/delivery-observability", diff --git a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts index 
48495de1..00e04707 100644 --- a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts @@ -3,7 +3,9 @@ import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; // ARGV: [now, consumedTokens, processingFailures, cooldownPeriodMs, recoveryPeriodMs, failureThreshold, minAttempts, samplePeriodMs] // KEYS: [epKey] -// Returns: [ok (0|1), state] state: "ok" | "closed" | "opened" | "failed" +// Returns: [circuitState, stateChanged] +// circuitState: "open" | "half_open" | "closed_recovery" | "closed" +// stateChanged: 0 | 1 type RecordResultArgs = { now: number; @@ -27,7 +29,7 @@ const defaultArgs: RecordResultArgs = { samplePeriodMs: 300_000, }; -type RecordResultResult = [number, string]; +type RecordResultResult = [string, number]; function runRecordResult( store: ReturnType, @@ -54,22 +56,36 @@ function runRecordResult( describe("record-result.lua", () => { describe("success recording", () => { - it("returns ok state for a successful batch with no state change", () => { + it("returns closed state for a successful batch with no state change", () => { const store = createRedisStore(); - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { consumedTokens: 5, processingFailures: 0, }); - expect(ok).toBe(1); - expect(state).toBe("ok"); + expect(circuitState).toBe("closed"); + expect(stateChanged).toBe(0); }); it("increments cur_attempts without incrementing cur_failures", () => { const store = createRedisStore(); - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); runRecordResult(store, { 
consumedTokens: 3, processingFailures: 0 }); @@ -82,7 +98,14 @@ describe("record-result.lua", () => { describe("failure recording", () => { it("increments both cur_attempts and cur_failures", () => { const store = createRedisStore(); - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); runRecordResult(store, { consumedTokens: 5, processingFailures: 1 }); @@ -91,17 +114,24 @@ describe("record-result.lua", () => { expect(epHash.get("cur_failures")).toBe("1"); }); - it("returns failed state for failures below threshold", () => { + it("returns closed state for failures below threshold", () => { const store = createRedisStore(); - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { consumedTokens: 1, processingFailures: 1, }); - expect(ok).toBe(0); - expect(state).toBe("failed"); + expect(circuitState).toBe("closed"); + expect(stateChanged).toBe(0); }); }); @@ -134,7 +164,7 @@ describe("record-result.lua", () => { expect(epHash.get("cur_failures")).toBe("0"); }); - it("returns failed when circuit is fully open and state unchanged", () => { + it("returns open when circuit is fully open and state unchanged", () => { const store = createRedisStore(); const now = 1_000_000; const switchedAt = now - 10_000; @@ -148,51 +178,72 @@ describe("record-result.lua", () => { ]), ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { now, cooldownPeriodMs: 120_000, consumedTokens: 1, processingFailures: 0, }); - expect(ok).toBe(0); - expect(state).toBe("failed"); + expect(circuitState).toBe("open"); + expect(stateChanged).toBe(0); }); }); 
describe("circuit opening", () => { it("opens circuit when failure rate exceeds threshold", () => { const store = createRedisStore(); - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { consumedTokens: 5, processingFailures: 5, minAttempts: 5, failureThreshold: 0.3, }); - expect(ok).toBe(0); - expect(state).toBe("opened"); + expect(circuitState).toBe("open"); + expect(stateChanged).toBe(1); }); it("does not open circuit when below minimum attempts", () => { const store = createRedisStore(); - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { consumedTokens: 3, processingFailures: 3, minAttempts: 5, failureThreshold: 0.3, }); - expect(ok).toBe(0); - expect(state).toBe("failed"); + expect(circuitState).toBe("closed"); + expect(stateChanged).toBe(0); }); it("sets is_open and switched_at on open", () => { const store = createRedisStore(); const now = 1_000_000; - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); runRecordResult(store, { now, @@ -211,7 +262,14 @@ describe("record-result.lua", () => { const store = createRedisStore(); const now = 1_000_000; const samplePeriodMs = 300_000; - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); runRecordResult(store, { now, @@ -246,15 +304,15 @@ describe("record-result.lua", 
() => { ]), ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { now, cooldownPeriodMs: 120_000, consumedTokens: 1, processingFailures: 0, }); - expect(ok).toBe(1); - expect(state).toBe("closed"); + expect(circuitState).toBe("closed_recovery"); + expect(stateChanged).toBe(1); const epHash = store.get("ep:t1")!; expect(epHash.get("is_open")).toBe("0"); @@ -275,15 +333,15 @@ describe("record-result.lua", () => { ]), ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { now, cooldownPeriodMs: 120_000, consumedTokens: 1, processingFailures: 1, }); - expect(ok).toBe(0); - expect(state).toBe("failed"); + expect(circuitState).toBe("half_open"); + expect(stateChanged).toBe(0); }); }); @@ -297,6 +355,8 @@ describe("record-result.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], + ["switched_at", "0"], ["sample_till", sampleTill.toString()], ["cur_attempts", "10"], ["cur_failures", "3"], @@ -324,6 +384,8 @@ describe("record-result.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], + ["switched_at", "0"], ["sample_till", sampleTill.toString()], ["cur_attempts", "10"], ["cur_failures", "3"], @@ -349,6 +411,8 @@ describe("record-result.lua", () => { store.set( "ep:t1", new Map([ + ["is_open", "0"], + ["switched_at", "0"], ["sample_till", sampleTill.toString()], ["prev_attempts", "10"], ["prev_failures", "10"], @@ -359,7 +423,7 @@ describe("record-result.lua", () => { // interpolated attempts = 10 * 1.0 + 5 = 15 (>= minAttempts 5) // interpolated failures = 10 * 1.0 + 5 = 15 // failure rate = 15/15 = 1.0 > 0.3 → opens - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { now, samplePeriodMs, consumedTokens: 5, @@ -367,15 +431,22 @@ describe("record-result.lua", () => { minAttempts: 5, failureThreshold: 0.3, }); - expect(ok).toBe(0); - expect(state).toBe("opened"); + 
expect(circuitState).toBe("open"); + expect(stateChanged).toBe(1); }); }); describe("state persistence", () => { it("writes all sampling fields to redis", () => { const store = createRedisStore(); - store.set("ep:t1", new Map([["sample_till", "9999999999"]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); runRecordResult(store); const epHash = store.get("ep:t1")!; diff --git a/lambdas/https-client-lambda/src/handler.ts b/lambdas/https-client-lambda/src/handler.ts index 764e7397..72a871e7 100644 --- a/lambdas/https-client-lambda/src/handler.ts +++ b/lambdas/https-client-lambda/src/handler.ts @@ -262,10 +262,13 @@ async function processTargetBatch( processingFailures, gateConfig, ); - if (!cbOutcome.ok && cbOutcome.state === "opened") { + if (cbOutcome.stateChanged && cbOutcome.circuitState === "open") { recordCircuitBreakerOpen(batch.targetId); } - if (cbOutcome.ok && cbOutcome.state === "closed") { + if ( + cbOutcome.stateChanged && + cbOutcome.circuitState === "closed_recovery" + ) { recordCircuitBreakerClosed(batch.targetId); } } diff --git a/lambdas/https-client-lambda/src/services/admit.lua b/lambdas/https-client-lambda/src/services/admit.lua index 36809e40..42e41d07 100644 --- a/lambdas/https-client-lambda/src/services/admit.lua +++ b/lambdas/https-client-lambda/src/services/admit.lua @@ -30,12 +30,12 @@ local targetBatchSize = tonumber(ARGV[7]) or 0 local state = redis.call("HMGET", epKey, "is_open", "switched_at", "bucket_tokens", "bucket_refilled_at") -local isOpen = tonumber(state[1] or "0") == 1 -local switchedAtRaw = state[2] -local switchedAt = tonumber(switchedAtRaw or tostring(now)) +local isOpenRaw = state[1] +local needInit = isOpenRaw == false or isOpenRaw == nil +local isOpen = needInit or tonumber(isOpenRaw) == 1 +local switchedAt = needInit and 0 or tonumber(state[2] or "0") local bucketTokens = tonumber(state[3] or "0") -local bucketRefilledAt = tonumber(state[4] or 
"0") -local needInitSwitchedAt = switchedAtRaw == false or switchedAtRaw == nil +local bucketRefilledAt = needInit and now or tonumber(state[4] or "0") -------------------------------------------------------------------------------- -- 1. CIRCUIT BREAKER — determine effective rate @@ -54,7 +54,9 @@ if isOpen then end else if isRecovering then - effectiveRate = targetRateLimit * (now - switchedAt) / recoveryPeriodMs + local rampRange = math.max(0, targetRateLimit - probeRateLimit) + local rampProgress = math.max(0, now - switchedAt) / recoveryPeriodMs + effectiveRate = probeRateLimit + rampProgress * rampRange else effectiveRate = targetRateLimit end @@ -70,6 +72,10 @@ end -- generate the whole tokens (not set to `now`), preserving fractional time. -------------------------------------------------------------------------------- +if isOpen then + bucketTokens = 0 +end + local generatedTokens = math.floor((now - bucketRefilledAt) * effectiveRate / 1000) local availTokens = math.min(capacity, bucketTokens + generatedTokens) local consumedTokens = math.min(targetBatchSize, availTokens) @@ -89,10 +95,6 @@ redis.call("HSET", epKey, "bucket_refilled_at", bucketRefilledAt ) -if needInitSwitchedAt then - redis.call("HSET", epKey, "switched_at", switchedAt) -end - local reason = consumedTokens < 1 and "rate_limited" or "allowed" local retryAfter = consumedTokens < 1 and 1000 or 0 return { consumedTokens, reason, retryAfter, effectiveRate } diff --git a/lambdas/https-client-lambda/src/services/endpoint-gate.ts b/lambdas/https-client-lambda/src/services/endpoint-gate.ts index 8a3b9089..8870e5d8 100644 --- a/lambdas/https-client-lambda/src/services/endpoint-gate.ts +++ b/lambdas/https-client-lambda/src/services/endpoint-gate.ts @@ -18,9 +18,12 @@ export type AdmitResultDenied = { export type AdmitResult = AdmitResultAllowed | AdmitResultDenied; -export type RecordResultOutcome = - | { ok: true; state: "closed" | "ok" } - | { ok: false; state: "opened" | "failed" }; +export type 
CircuitState = "open" | "half_open" | "closed_recovery" | "closed"; + +export type RecordResultOutcome = { + circuitState: CircuitState; + stateChanged: boolean; +}; export type EndpointGateConfig = { burstCapacity: number; @@ -154,15 +157,14 @@ export async function recordResult( recordResultSha, [epKey], args, - )) as [number, string]; - - const [ok, state] = raw; + )) as [string, number]; - if (ok === 1) { - return { ok: true, state: state as "closed" | "ok" }; - } + const [circuitState, stateChanged] = raw; - return { ok: false, state: state as "opened" | "failed" }; + return { + circuitState: circuitState as CircuitState, + stateChanged: stateChanged === 1, + }; } export function resetAdmitSha(): void { diff --git a/lambdas/https-client-lambda/src/services/record-result.lua b/lambdas/https-client-lambda/src/services/record-result.lua index c8c6a0a6..fa42efea 100644 --- a/lambdas/https-client-lambda/src/services/record-result.lua +++ b/lambdas/https-client-lambda/src/services/record-result.lua @@ -7,14 +7,23 @@ -- 4. Checks whether to close the circuit (half-open + successes) -- 5. 
Checks whether to open the circuit (closed + threshold exceeded) -- --- Returns: { ok (0|1), state } --- state: "closed" | "opened" | "failed" +-- Returns: { circuitState, stateChanged } +-- +-- circuitState: the current state of the circuit after this run +-- "open" — fully open (during cooldown, no probes) +-- "half_open" — open but past cooldown (probing) +-- "closed_recovery" — closed but ramping up (recovery period) +-- "closed" — closed, running at full rate +-- +-- stateChanged: whether a circuit transition occurred this run +-- 1 — the circuit opened or closed during this execution +-- 0 — no state transition --- Return state constants -local OPENED = "opened" -local CLOSED = "closed" -local FAILED = "failed" -local OK = "ok" +-- Circuit state constants +local OPEN = "open" +local HALF_OPEN = "half_open" +local CLOSED_RECOVERY = "closed_recovery" +local CLOSED = "closed" -- Keys local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash @@ -24,7 +33,7 @@ local now = tonumber(ARGV[1]) or 0 local consumedTokens = tonumber(ARGV[2]) or 0 local processingFailures = tonumber(ARGV[3]) or 0 local cooldownPeriodMs = tonumber(ARGV[4]) or 0 -local _recoveryPeriodMs = tonumber(ARGV[5]) or 0 -- luacheck: ignore +local recoveryPeriodMs = tonumber(ARGV[5]) or 0 local failureThreshold = tonumber(ARGV[6]) or 0 local minAttempts = tonumber(ARGV[7]) or 0 local samplePeriodMs = tonumber(ARGV[8]) or 0 @@ -37,8 +46,10 @@ local state = redis.call("HMGET", epKey, "is_open", "switched_at", "cur_attempts", "prev_attempts", "cur_failures", "prev_failures", "sample_till") -local isOpen = tonumber(state[1] or "0") == 1 -local switchedAt = tonumber(state[2] or tostring(now)) +local isOpenRaw = state[1] +local needInit = isOpenRaw == false or isOpenRaw == nil +local isOpen = needInit or tonumber(isOpenRaw) == 1 +local switchedAt = needInit and 0 or tonumber(state[2] or "0") local curAttempts = tonumber(state[3] or "0") local prevAttempts = tonumber(state[4] or "0") local 
curFailures = tonumber(state[5] or "0") @@ -118,7 +129,26 @@ if not isOpen and hasSampledEnough and (failures / attempts) > failureThreshold end -------------------------------------------------------------------------------- --- 6. PERSIST STATE +-- 6. DETERMINE CURRENT CIRCUIT STATE FOR REPORTING +-------------------------------------------------------------------------------- + +local circuitState +if isOpen then + if now > switchedAt + cooldownPeriodMs then + circuitState = HALF_OPEN + else + circuitState = OPEN + end +else + if now < switchedAt + recoveryPeriodMs then + circuitState = CLOSED_RECOVERY + else + circuitState = CLOSED + end +end + +-------------------------------------------------------------------------------- +-- 7. PERSIST STATE -------------------------------------------------------------------------------- redis.call("HSET", epKey, @@ -136,16 +166,4 @@ if stateChanged then ) end -if stateChanged and isOpen then - return { 0, OPENED } -end - -if stateChanged and not isOpen then - return { 1, CLOSED } -end - -if isOpen or processingFailures > 0 then - return { 0, FAILED } -end - -return { 1, OK } +return { circuitState, stateChanged and 1 or 0 } From be5d8bce4d600c89fa759080d9713caf3330c573 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Tue, 28 Apr 2026 13:21:49 +0100 Subject: [PATCH 5/9] CCM-16073 - Updated rate limiting behaviour --- .../src/__tests__/handler.test.ts | 8 ++++++-- lambdas/https-client-lambda/src/handler.ts | 6 ++++-- lambdas/https-client-lambda/src/services/admit.lua | 12 ++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts index cfdc57e0..ab5dff46 100644 --- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts @@ -336,7 +336,9 @@ describe("processRecords", () => { const failures = await processRecords([makeRecord()]); 
expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); - expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 2); + const visibilityDelay = mockChangeVisibility.mock.calls[0]![1] as number; + expect(visibilityDelay).toBeGreaterThanOrEqual(2); + expect(visibilityDelay).toBeLessThanOrEqual(6); expect(mockSendToDlq).not.toHaveBeenCalled(); expect(mockDeliverPayload).not.toHaveBeenCalled(); }); @@ -352,7 +354,9 @@ describe("processRecords", () => { const failures = await processRecords([makeRecord()]); expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); - expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 30); + const visibilityDelay = mockChangeVisibility.mock.calls[0]![1] as number; + expect(visibilityDelay).toBeGreaterThanOrEqual(30); + expect(visibilityDelay).toBeLessThanOrEqual(34); expect(mockSendToDlq).not.toHaveBeenCalled(); expect(mockDeliverPayload).not.toHaveBeenCalled(); }); diff --git a/lambdas/https-client-lambda/src/handler.ts b/lambdas/https-client-lambda/src/handler.ts index 72a871e7..b129fc8d 100644 --- a/lambdas/https-client-lambda/src/handler.ts +++ b/lambdas/https-client-lambda/src/handler.ts @@ -199,11 +199,13 @@ async function processTargetBatch( ); if (!gateResult.allowed) { - const delaySec = Math.ceil(gateResult.retryAfterMs / 1000); + const baseDelaySec = Math.ceil(gateResult.retryAfterMs / 1000); recordAdmissionDenied(clientId, batch.targetId, gateResult.reason); const failures: SQSBatchItemFailure[] = []; for (const record of batch.records) { - await changeVisibility(record.receiptHandle, delaySec); + // eslint-disable-next-line sonarjs/pseudo-random -- jitter for backoff, not security-sensitive + const jitterSec = Math.floor(Math.random() * 5); + await changeVisibility(record.receiptHandle, baseDelaySec + jitterSec); failures.push({ itemIdentifier: record.messageId }); } return failures; diff --git a/lambdas/https-client-lambda/src/services/admit.lua b/lambdas/https-client-lambda/src/services/admit.lua index 
42e41d07..53b6977c 100644 --- a/lambdas/https-client-lambda/src/services/admit.lua +++ b/lambdas/https-client-lambda/src/services/admit.lua @@ -68,8 +68,16 @@ end -- Generate tokens based on elapsed time, then consume as many as needed for -- the batch, up to the number available. -- --- Refill precision: bucketRefilledAt advances by exactly the time required to --- generate the whole tokens (not set to `now`), preserving fractional time. +-- bucketRefilledAt tracks the point in time up to which tokens have been +-- generated. We advance it by exactly the time needed to produce the whole +-- tokens we generated (generationTime), rather than setting it to `now`. +-- +-- Why not `now`? Token generation uses floor(), so any sub-token fractional +-- time is truncated. Setting bucketRefilledAt = now would discard that +-- remainder, meaning the next call starts its elapsed-time calculation from +-- a later point than it should. Over many calls this causes token leakage — +-- the bucket refills slower than the configured rate. By advancing only by +-- generationTime, the leftover fractional time carries over to the next call. 
-------------------------------------------------------------------------------- if isOpen then From 5e671d003db58b8854cb85cb38f4c211b0db4e0b Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Wed, 29 Apr 2026 09:03:48 +0100 Subject: [PATCH 6/9] CCM-16073 - Split clients infra into its own component --- .github/workflows/cicd-1-pull-request.yaml | 28 ++++- .github/workflows/pr_closed.yml | 2 +- .github/workflows/pr_destroy_dynamic_env.yml | 16 +++ .github/workflows/release_created.yml | 2 +- .../callbacks-clients/.tool-versions | 1 + .../components/callbacks-clients/README.md | 43 +++++++ .../components/callbacks-clients/_paths.sh | 8 ++ .../components/callbacks-clients/locals.tf | 57 ++++++++++ .../callbacks-clients/locals_remote_state.tf | 59 ++++++++++ .../callbacks-clients/locals_tfscaffold.tf | 44 ++++++++ .../module_client_delivery.tf | 48 ++++++++ .../components/callbacks-clients/outputs.tf | 15 +++ .../components/callbacks-clients/pre.sh | 40 +++++++ .../callbacks-clients/provider_aws.tf | 11 ++ .../s3_object_client_config.tf | 12 ++ .../ssm_parameter_applications_map.tf | 4 +- .../callbacks-clients/sync-client-config.sh | 48 ++++++++ .../components/callbacks-clients/variables.tf | 96 ++++++++++++++++ .../components/callbacks-clients/versions.tf | 14 +++ .../terraform/components/callbacks/README.md | 22 +++- .../terraform/components/callbacks/locals.tf | 55 --------- .../callbacks/module_client_delivery.tf | 48 -------- .../callbacks/module_perf_runner_lambda.tf | 9 +- .../terraform/components/callbacks/outputs.tf | 105 ++++++++++++++++++ .../terraform/components/callbacks/pre.sh | 31 +----- .../callbacks/s3_bucket_client_config.tf | 13 --- .../components/callbacks/variables.tf | 6 - .../src/__tests__/index.test.ts | 9 +- .../src/__tests__/purge.test.ts | 38 +++++-- .../src/__tests__/runner.test.ts | 18 ++- lambdas/perf-runner-lambda/src/index.ts | 2 + lambdas/perf-runner-lambda/src/purge.ts | 10 +- lambdas/perf-runner-lambda/src/runner.ts | 6 +- 
lambdas/perf-runner-lambda/src/types.ts | 1 + scripts/tests/integration-debug.sh | 11 +- 35 files changed, 743 insertions(+), 189 deletions(-) create mode 100644 infrastructure/terraform/components/callbacks-clients/.tool-versions create mode 100644 infrastructure/terraform/components/callbacks-clients/README.md create mode 100644 infrastructure/terraform/components/callbacks-clients/_paths.sh create mode 100644 infrastructure/terraform/components/callbacks-clients/locals.tf create mode 100644 infrastructure/terraform/components/callbacks-clients/locals_remote_state.tf create mode 100644 infrastructure/terraform/components/callbacks-clients/locals_tfscaffold.tf create mode 100644 infrastructure/terraform/components/callbacks-clients/module_client_delivery.tf create mode 100644 infrastructure/terraform/components/callbacks-clients/outputs.tf create mode 100644 infrastructure/terraform/components/callbacks-clients/pre.sh create mode 100644 infrastructure/terraform/components/callbacks-clients/provider_aws.tf create mode 100644 infrastructure/terraform/components/callbacks-clients/s3_object_client_config.tf rename infrastructure/terraform/components/{callbacks => callbacks-clients}/ssm_parameter_applications_map.tf (83%) create mode 100644 infrastructure/terraform/components/callbacks-clients/sync-client-config.sh create mode 100644 infrastructure/terraform/components/callbacks-clients/variables.tf create mode 100644 infrastructure/terraform/components/callbacks-clients/versions.tf delete mode 100644 infrastructure/terraform/components/callbacks/module_client_delivery.tf diff --git a/.github/workflows/cicd-1-pull-request.yaml b/.github/workflows/cicd-1-pull-request.yaml index aa5a82bf..d293f0bd 100644 --- a/.github/workflows/cicd-1-pull-request.yaml +++ b/.github/workflows/cicd-1-pull-request.yaml @@ -173,12 +173,36 @@ jobs: --overrideProjectName "nhs" \ --overrideRoleName "nhs-main-acct-client-callbacks-github-deploy" \ --overrides 
"branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" + pr-create-dynamic-environment-clients: + name: Create Dynamic Environment (clients) + needs: [metadata, pr-create-dynamic-environment] + runs-on: ubuntu-latest + if: needs.metadata.outputs.does_pull_request_exist == 'true' && github.ref != 'refs/heads/main' + env: + APP_CLIENT_ID: ${{ secrets.APP_CLIENT_ID }} + APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Trigger dynamic environment creation (callbacks-clients) + shell: bash + run: | + .github/scripts/dispatch_internal_repo_workflow.sh \ + --infraRepoName "$(echo ${{ github.repository }} | cut -d'/' -f2)" \ + --releaseVersion "${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" \ + --targetWorkflow "dispatch-deploy-dynamic-env.yaml" \ + --targetEnvironment "pr${{ needs.metadata.outputs.pr_number }}" \ + --targetComponent "callbacks-clients" \ + --targetAccountGroup "nhs-notify-client-callbacks-dev" \ + --terraformAction "apply" \ + --overrideProjectName "nhs" \ + --overrideRoleName "nhs-main-acct-client-callbacks-github-deploy" \ + --overrides "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" acceptance-stage: # Recommended maximum execution time is 10 minutes name: "Acceptance stage" - needs: [metadata, build-stage, pr-create-dynamic-environment] + needs: [metadata, build-stage, pr-create-dynamic-environment, pr-create-dynamic-environment-clients] uses: ./.github/workflows/stage-4-acceptance.yaml if: >- - contains(fromJSON('["success", "skipped"]'), needs.pr-create-dynamic-environment.result) && + contains(fromJSON('["success", "skipped"]'), needs.pr-create-dynamic-environment-clients.result) && (needs.metadata.outputs.does_pull_request_exist == 'true' || (github.event_name == 'pull_request' && (github.event.action == 'opened' || github.event.action == 'reopened')) || (github.event_name == 'push' && github.ref == 'refs/heads/main')) with: build_datetime: 
"${{ needs.metadata.outputs.build_datetime }}" diff --git a/.github/workflows/pr_closed.yml b/.github/workflows/pr_closed.yml index 42e61428..bdcfc4b4 100644 --- a/.github/workflows/pr_closed.yml +++ b/.github/workflows/pr_closed.yml @@ -46,7 +46,7 @@ jobs: strategy: max-parallel: 1 matrix: - component: [callbacks] + component: [callbacks, callbacks-clients] steps: - name: Checkout repository diff --git a/.github/workflows/pr_destroy_dynamic_env.yml b/.github/workflows/pr_destroy_dynamic_env.yml index 67abd292..7aeb3d1f 100644 --- a/.github/workflows/pr_destroy_dynamic_env.yml +++ b/.github/workflows/pr_destroy_dynamic_env.yml @@ -19,6 +19,22 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Trigger dynamic environment destroy (callbacks-clients) + env: + APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} + APP_CLIENT_ID: ${{ secrets.APP_CLIENT_ID }} + shell: bash + run: | + .github/scripts/dispatch_internal_repo_workflow.sh \ + --infraRepoName "$(echo ${{ github.repository }} | cut -d'/' -f2)" \ + --releaseVersion "${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" \ + --targetWorkflow "dispatch-deploy-dynamic-env.yaml" \ + --targetEnvironment "pr${{ github.event.number }}" \ + --targetComponent "callbacks-clients" \ + --targetAccountGroup "nhs-notify-client-callbacks-dev" \ + --terraformAction "destroy" \ + --overrideProjectName "nhs" \ + --overrideRoleName "nhs-main-acct-client-callbacks-github-deploy" \ - name: Trigger dynamic environment destroy env: APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} diff --git a/.github/workflows/release_created.yml b/.github/workflows/release_created.yml index 329282ae..eb18f897 100644 --- a/.github/workflows/release_created.yml +++ b/.github/workflows/release_created.yml @@ -22,7 +22,7 @@ jobs: strategy: max-parallel: 1 matrix: - component: [callbacks] + component: [callbacks, callbacks-clients] steps: - name: Checkout repository diff --git 
a/infrastructure/terraform/components/callbacks-clients/.tool-versions b/infrastructure/terraform/components/callbacks-clients/.tool-versions new file mode 100644 index 00000000..3dd74c72 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/.tool-versions @@ -0,0 +1 @@ +terraform 1.10.1 diff --git a/infrastructure/terraform/components/callbacks-clients/README.md b/infrastructure/terraform/components/callbacks-clients/README.md new file mode 100644 index 00000000..2f5580bb --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/README.md @@ -0,0 +1,43 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.10.1 | +| [aws](#requirement\_aws) | 6.13 | +| [random](#requirement\_random) | ~> 3.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [aws\_account\_id](#input\_aws\_account\_id) | The AWS Account ID (numeric) | `string` | n/a | yes | +| [component](#input\_component) | The variable encapsulating the name of this component | `string` | `"callbacks-clients"` | no | +| [default\_tags](#input\_default\_tags) | A map of default tags to apply to all taggable resources within the component | `map(string)` | `{}` | no | +| [deploy\_mock\_clients](#input\_deploy\_mock\_clients) | Flag to deploy mock client subscription config for integration testing (test/dev environments only) | `bool` | `false` | no | +| [deploy\_perf\_runner](#input\_deploy\_perf\_runner) | Flag to deploy performance test client subscription fixtures | `bool` | `false` | no | +| [enable\_xray\_tracing](#input\_enable\_xray\_tracing) | Enable AWS X-Ray active tracing for Lambda functions | `bool` | `false` | no | +| [environment](#input\_environment) | The name of the tfscaffold environment | `string` | n/a | yes | +| [force\_lambda\_code\_deploy](#input\_force\_lambda\_code\_deploy) | If the lambda package in s3 has the same 
commit id tag as the terraform build branch, the lambda will not update automatically. Set to True if making changes to Lambda code from on the same commit for example during development | `bool` | `false` | no | +| [group](#input\_group) | The group variables are being inherited from (often synonmous with account short-name) | `string` | n/a | yes | +| [log\_level](#input\_log\_level) | The log level to be used in lambda functions within the component | `string` | `"INFO"` | no | +| [log\_retention\_in\_days](#input\_log\_retention\_in\_days) | The retention period in days for the Cloudwatch Logs events to be retained, default of 0 is indefinite | `number` | `0` | no | +| [parent\_acct\_environment](#input\_parent\_acct\_environment) | Name of the environment responsible for the acct resources used, affects things like DNS zone. Useful for named dev environments | `string` | `"main"` | no | +| [project](#input\_project) | The name of the tfscaffold project | `string` | n/a | yes | +| [region](#input\_region) | The AWS Region | `string` | n/a | yes | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [client\_delivery](#module\_client\_delivery) | ../../modules/client-delivery | n/a | +## Outputs + +| Name | Description | +|------|-------------| +| [deployment](#output\_deployment) | Deployment details used for post-deployment scripts | + + + diff --git a/infrastructure/terraform/components/callbacks-clients/_paths.sh b/infrastructure/terraform/components/callbacks-clients/_paths.sh new file mode 100644 index 00000000..9b9aba00 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/_paths.sh @@ -0,0 +1,8 @@ +_paths_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(cd "${_paths_dir}/../../../.." 
&& pwd)" +clients_dir="${repo_root}/infrastructure/terraform/modules/clients" + +# Follow symlinks to find the real nhs-notify-client-callbacks root +# (repo_root resolves to the workspace root, which differs in CI where the component is symlinked in) +_real_script="$(readlink -f "${BASH_SOURCE[0]}")" +bounded_context_root="$(cd "$(dirname "${_real_script}")/../../../.." && pwd)" diff --git a/infrastructure/terraform/components/callbacks-clients/locals.tf b/infrastructure/terraform/components/callbacks-clients/locals.tf new file mode 100644 index 00000000..ee33b2ba --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/locals.tf @@ -0,0 +1,57 @@ +locals { + aws_lambda_functions_dir_path = "../../../../lambdas" + + clients_dir_path = "${path.module}/../../modules/clients" + + config_clients = merge([ + for filename in fileset(local.clients_dir_path, "*.json") : { + (replace(filename, ".json", "")) = jsondecode(file("${local.clients_dir_path}/${filename}")) + } + ]...) + + # When deploying mock clients, replace sentinel placeholder values with the mock webhook URL and API key. + # Only used for S3 object content — must not be used as a for_each source (contains apply-time values). + enriched_mock_config_clients = var.deploy_mock_clients ? { + for client_id, client in local.config_clients : + client_id => merge(client, { + targets = [ + for target in try(client.targets, []) : + merge(target, { + invocationEndpoint = "https://${local.callbacks.mock_webhook_alb_dns_name}/${target.targetId}" + apiKey = merge(target.apiKey, { headerValue = local.callbacks.mock_webhook_api_key }) + delivery = merge(try(target.delivery, {}), { + mtls = merge(try(target.delivery.mtls, {}), { + certPinning = merge(try(target.delivery.mtls.certPinning, {}), try(target.delivery.mtls.certPinning.enabled, false) ?
{ + spkiHash = local.callbacks.mock_server_spki_hash + } : {}) + }) + }) + }) + ] + }) + } : local.config_clients + + client_subscriptions = { + for client_id, data in local.config_clients : + client_id => { + for subscription in try(data.subscriptions, []) : + subscription.subscriptionId => { + subscription_id = subscription.subscriptionId + target_ids = try(subscription.targetIds, []) + } + } + } + + client_subscription_targets = { + for client_id, data in local.config_clients : + client_id => merge([ + for subscription in try(data.subscriptions, []) : { + for target_id in try(subscription.targetIds, []) : + "${subscription.subscriptionId}-${target_id}" => { + subscription_id = subscription.subscriptionId + target_id = target_id + } + } + ]...) + } +} diff --git a/infrastructure/terraform/components/callbacks-clients/locals_remote_state.tf b/infrastructure/terraform/components/callbacks-clients/locals_remote_state.tf new file mode 100644 index 00000000..8fbc867c --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/locals_remote_state.tf @@ -0,0 +1,59 @@ +locals { + bootstrap = data.terraform_remote_state.bootstrap.outputs + acct = data.terraform_remote_state.acct.outputs + callbacks = data.terraform_remote_state.callbacks.outputs +} + +data "terraform_remote_state" "bootstrap" { + backend = "s3" + + config = { + bucket = local.terraform_state_bucket + + key = format( + "%s/%s/%s/%s/bootstrap.tfstate", + var.project, + var.aws_account_id, + "eu-west-2", + "bootstrap" + ) + + region = "eu-west-2" + } +} + +data "terraform_remote_state" "acct" { + backend = "s3" + + config = { + bucket = local.terraform_state_bucket + + key = format( + "%s/%s/%s/%s/acct.tfstate", + var.project, + var.aws_account_id, + "eu-west-2", + var.parent_acct_environment + ) + + region = "eu-west-2" + } +} + +data "terraform_remote_state" "callbacks" { + backend = "s3" + + config = { + bucket = local.terraform_state_bucket + + key = format( + 
"%s/%s/%s/%s/callbacks.tfstate", + var.project, + var.aws_account_id, + var.region, + var.environment + ) + + region = var.region + } +} diff --git a/infrastructure/terraform/components/callbacks-clients/locals_tfscaffold.tf b/infrastructure/terraform/components/callbacks-clients/locals_tfscaffold.tf new file mode 100644 index 00000000..b7cf3217 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/locals_tfscaffold.tf @@ -0,0 +1,44 @@ +locals { + terraform_state_bucket = format( + "%s-tfscaffold-%s-%s", + var.project, + var.aws_account_id, + var.region, + ) + + csi = replace( + format( + "%s-%s-%s", + var.project, + var.environment, + var.component, + ), + "_", + "", + ) + + # CSI for use in resources with a global namespace, i.e. S3 Buckets + csi_global = replace( + format( + "%s-%s-%s-%s-%s", + var.project, + var.aws_account_id, + var.region, + var.environment, + var.component, + ), + "_", + "", + ) + + default_tags = merge( + var.default_tags, + { + Project = var.project + Environment = var.environment + Component = var.component + Group = var.group + Name = local.csi + }, + ) +} diff --git a/infrastructure/terraform/components/callbacks-clients/module_client_delivery.tf b/infrastructure/terraform/components/callbacks-clients/module_client_delivery.tf new file mode 100644 index 00000000..823c0912 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/module_client_delivery.tf @@ -0,0 +1,48 @@ +module "client_delivery" { + source = "../../modules/client-delivery" + for_each = local.config_clients + + project = var.project + aws_account_id = var.aws_account_id + region = var.region + component = var.component + environment = var.environment + group = var.group + + client_id = each.key + client_bus_name = local.callbacks.event_bus_name + kms_key_arn = local.callbacks.kms_key_arn + + subscriptions = local.client_subscriptions[each.key] + subscription_targets = local.client_subscription_targets[each.key] + + 
client_config_bucket = local.callbacks.client_config_bucket + client_config_bucket_arn = local.callbacks.client_config_bucket_arn + + applications_map_parameter_name = local.callbacks.applications_map_parameter_name + + lambda_s3_bucket = local.callbacks.lambda_s3_bucket + lambda_code_base_path = local.aws_lambda_functions_dir_path + + force_lambda_code_deploy = var.force_lambda_code_deploy + log_level = var.log_level + log_retention_in_days = var.log_retention_in_days + enable_xray_tracing = var.enable_xray_tracing + + log_destination_arn = local.callbacks.log_destination_arn + log_subscription_role_arn = local.callbacks.log_subscription_role_arn + + elasticache_endpoint = local.callbacks.elasticache_endpoint + elasticache_cache_name = local.callbacks.elasticache_cache_name + elasticache_iam_username = local.callbacks.elasticache_iam_username + + mtls_cert_secret_arn = local.callbacks.mtls_cert_secret_arn + mtls_test_cert_s3_bucket = local.callbacks.mtls_test_cert_s3_bucket + mtls_test_cert_s3_key = local.callbacks.mtls_test_cert_s3_key # gitleaks:allow + mtls_test_ca_s3_key = local.callbacks.mtls_test_ca_s3_key # gitleaks:allow + + token_bucket_burst_capacity = var.token_bucket_burst_capacity + + vpc_subnet_ids = local.callbacks.vpc_subnet_ids + lambda_security_group_id = local.callbacks.lambda_security_group_id +} diff --git a/infrastructure/terraform/components/callbacks-clients/outputs.tf b/infrastructure/terraform/components/callbacks-clients/outputs.tf new file mode 100644 index 00000000..c443add1 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/outputs.tf @@ -0,0 +1,15 @@ +## +# Deployment details +## + +output "deployment" { + description = "Deployment details used for post-deployment scripts" + value = { + aws_region = var.region + aws_account_id = var.aws_account_id + project = var.project + environment = var.environment + group = var.group + component = var.component + } +} diff --git 
a/infrastructure/terraform/components/callbacks-clients/pre.sh b/infrastructure/terraform/components/callbacks-clients/pre.sh new file mode 100644 index 00000000..cd2e8a22 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/pre.sh @@ -0,0 +1,40 @@ +# This script is run before the Terraform apply command. +# It syncs client config from S3, copies mock/perf fixtures if needed, and builds lambda workspaces. + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_paths.sh +source "${script_dir}/_paths.sh" + +# Resolve deploy_mock_clients and deploy_perf_runner from tfvars; base_path/group/region/environment are in scope from terraform.sh +deploy_mock_clients="false" +deploy_perf_runner="false" +for _tfvar_file in \ + "${base_path}/etc/group_${group}.tfvars" \ + "${base_path}/etc/env_${region}_${environment}.tfvars"; do + if [ -f "${_tfvar_file}" ]; then + _val=$(grep -E '^\s*deploy_mock_clients\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') + [ -n "${_val}" ] && deploy_mock_clients="${_val}" + _val=$(grep -E '^\s*deploy_perf_runner\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') + [ -n "${_val}" ] && deploy_perf_runner="${_val}" + fi +done +echo "deploy_mock_clients resolved to: ${deploy_mock_clients}" +echo "deploy_perf_runner resolved to: ${deploy_perf_runner}" + +pnpm install --frozen-lockfile + +pnpm run generate-dependencies + +"${script_dir}/sync-client-config.sh" + +if [ "${deploy_mock_clients}" == "true" ]; then + cp "${bounded_context_root}/tests/integration/fixtures/subscriptions/"*.json "${clients_dir}/" + echo "Copied mock client subscription config fixtures into clients dir" +fi + +if [ "${deploy_perf_runner}" == "true" ]; then + cp "${bounded_context_root}/tests/performance/fixtures/subscriptions/"*.json "${clients_dir}/" + echo "Copied perf client subscription config fixtures into clients dir" +fi + +pnpm run --recursive --if-present lambda-build diff --git 
a/infrastructure/terraform/components/callbacks-clients/provider_aws.tf b/infrastructure/terraform/components/callbacks-clients/provider_aws.tf new file mode 100644 index 00000000..c3ed73bb --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/provider_aws.tf @@ -0,0 +1,11 @@ +provider "aws" { + region = var.region + + allowed_account_ids = [ + var.aws_account_id, + ] + + default_tags { + tags = local.default_tags + } +} diff --git a/infrastructure/terraform/components/callbacks-clients/s3_object_client_config.tf b/infrastructure/terraform/components/callbacks-clients/s3_object_client_config.tf new file mode 100644 index 00000000..aa7de6c3 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/s3_object_client_config.tf @@ -0,0 +1,12 @@ +resource "aws_s3_object" "mock_client_config" { + for_each = var.deploy_mock_clients ? toset(keys(local.config_clients)) : toset([]) + + bucket = local.callbacks.client_config_bucket + key = "client_subscriptions/${local.config_clients[each.key].clientId}.json" + content = jsonencode(local.enriched_mock_config_clients[each.key]) + + kms_key_id = local.callbacks.kms_key_arn + server_side_encryption = "aws:kms" + + content_type = "application/json" +} diff --git a/infrastructure/terraform/components/callbacks/ssm_parameter_applications_map.tf b/infrastructure/terraform/components/callbacks-clients/ssm_parameter_applications_map.tf similarity index 83% rename from infrastructure/terraform/components/callbacks/ssm_parameter_applications_map.tf rename to infrastructure/terraform/components/callbacks-clients/ssm_parameter_applications_map.tf index 567647d1..60ba72f3 100644 --- a/infrastructure/terraform/components/callbacks/ssm_parameter_applications_map.tf +++ b/infrastructure/terraform/components/callbacks-clients/ssm_parameter_applications_map.tf @@ -5,9 +5,9 @@ resource "random_password" "mock_application_id" { } resource "aws_ssm_parameter" "applications_map" { - name = 
local.applications_map_parameter_name + name = local.callbacks.applications_map_parameter_name type = "SecureString" - key_id = module.kms.key_arn + key_id = local.callbacks.kms_key_arn value = var.deploy_mock_clients ? jsonencode({ for id in keys(local.config_clients) : local.config_clients[id].clientId => random_password.mock_application_id[id].result diff --git a/infrastructure/terraform/components/callbacks-clients/sync-client-config.sh b/infrastructure/terraform/components/callbacks-clients/sync-client-config.sh new file mode 100644 index 00000000..2c2a3ecb --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/sync-client-config.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Seeds local client subscription JSON files from S3 into modules/clients/ before Terraform runs. +# Terraform reads those files via fileset() to build local.config_clients. +# On first apply the bucket may not exist yet; this is handled gracefully. + +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_paths.sh +source "${script_dir}/_paths.sh" + +: "${ENVIRONMENT:?ENVIRONMENT must be set}" +: "${AWS_REGION:?AWS_REGION must be set}" +: "${AWS_ACCOUNT_ID:?AWS_ACCOUNT_ID must be set}" + +cd "${repo_root}" + +rm -f "${clients_dir}"/*.json + +bucket_name="nhs-${AWS_ACCOUNT_ID}-${AWS_REGION}-${ENVIRONMENT}-callbacks-subscription-config" + +s3_prefix="client_subscriptions/" + +echo "Seeding client configs from s3://${bucket_name}/${s3_prefix} for ${ENVIRONMENT}/${AWS_REGION}" + +if ! sync_output=$(aws s3 sync "s3://${bucket_name}/${s3_prefix}" "${clients_dir}/" \ + --region "${AWS_REGION}" \ + --exclude "*" \ + --include "*.json" \ + --only-show-errors 2>&1); then + if [[ "${sync_output}" == *"NoSuchBucket"* ]]; then + # Expected on first apply before Terraform creates the bucket. 
+ echo "Client config bucket not found yet; skipping sync for first run" + else + echo "Failed to sync client config from S3" >&2 + echo "${sync_output}" >&2 + exit 1 + fi +fi + +# Ensure an empty directory produces a zero-length array rather than a literal "*.json" entry. +shopt -s nullglob +seeded_files=("${clients_dir}"/*.json) +seeded_count="${#seeded_files[@]}" +shopt -u nullglob + +echo "Seeded ${seeded_count} client config file(s)" diff --git a/infrastructure/terraform/components/callbacks-clients/variables.tf b/infrastructure/terraform/components/callbacks-clients/variables.tf new file mode 100644 index 00000000..31502b14 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/variables.tf @@ -0,0 +1,96 @@ +## +# Basic Required Variables for tfscaffold Components +## + +variable "project" { + type = string + description = "The name of the tfscaffold project" +} + +variable "environment" { + type = string + description = "The name of the tfscaffold environment" +} + +variable "aws_account_id" { + type = string + description = "The AWS Account ID (numeric)" +} + +variable "region" { + type = string + description = "The AWS Region" +} + +variable "group" { + type = string + description = "The group variables are being inherited from (often synonymous with account short-name)" +} + +## +# tfscaffold variables specific to this component +## + +variable "component" { + type = string + description = "The variable encapsulating the name of this component" + default = "callbacks-clients" +} + +variable "default_tags" { + type = map(string) + description = "A map of default tags to apply to all taggable resources within the component" + default = {} +} + +variable "parent_acct_environment" { + type = string + description = "Name of the environment responsible for the acct resources used, affects things like DNS zone. 
Useful for named dev environments" + default = "main" +} + +## +# Variables specific to the component +## + +variable "deploy_mock_clients" { + type = bool + description = "Flag to deploy mock client subscription config for integration testing (test/dev environments only)" + default = false +} + +variable "deploy_perf_runner" { + type = bool + description = "Flag to deploy performance test client subscription fixtures" + default = false +} + +variable "token_bucket_burst_capacity" { + type = number + description = "Token bucket burst capacity used by the rate limiter" + default = 2250 +} + +variable "log_retention_in_days" { + type = number + description = "The retention period in days for the Cloudwatch Logs events to be retained, default of 0 is indefinite" + default = 0 +} + +variable "log_level" { + type = string + description = "The log level to be used in lambda functions within the component" + default = "INFO" +} + +variable "force_lambda_code_deploy" { + type = bool + description = "If the lambda package in s3 has the same commit id tag as the terraform build branch, the lambda will not update automatically. 
Set to True if making changes to Lambda code on the same commit, for example during development" + default = false +} + +variable "enable_xray_tracing" { + type = bool + description = "Enable AWS X-Ray active tracing for Lambda functions" + default = false +} diff --git a/infrastructure/terraform/components/callbacks-clients/versions.tf b/infrastructure/terraform/components/callbacks-clients/versions.tf new file mode 100644 index 00000000..55552749 --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "6.13" + } + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + } + + required_version = ">= 1.10.1" +} diff --git a/infrastructure/terraform/components/callbacks/README.md b/infrastructure/terraform/components/callbacks/README.md index e090abb9..c725fe42 100644 --- a/infrastructure/terraform/components/callbacks/README.md +++ b/infrastructure/terraform/components/callbacks/README.md @@ -45,13 +45,11 @@ | [s3\_enable\_force\_destroy](#input\_s3\_enable\_force\_destroy) | Whether to enable force destroy for the S3 buckets created in this module | `bool` | `false` | no | | [sqs\_inbound\_event\_max\_receive\_count](#input\_sqs\_inbound\_event\_max\_receive\_count) | n/a | `number` | `3` | no | | [sqs\_inbound\_event\_visibility\_timeout\_seconds](#input\_sqs\_inbound\_event\_visibility\_timeout\_seconds) | n/a | `number` | `60` | no | -| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | ## Modules | Name | Source | Version | |------|--------|---------| | [client\_config\_bucket](#module\_client\_config\_bucket) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-s3bucket.zip | n/a | -| [client\_delivery](#module\_client\_delivery) | ../../modules/client-delivery | n/a | | 
[client\_transform\_filter\_lambda](#module\_client\_transform\_filter\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-lambda.zip | n/a | | [kms](#module\_kms) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-kms.zip | n/a | | [mock\_webhook\_lambda](#module\_mock\_webhook\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-lambda.zip | n/a | @@ -62,8 +60,28 @@ | Name | Description | |------|-------------| +| [applications\_map\_parameter\_name](#output\_applications\_map\_parameter\_name) | SSM Parameter Store path for the clientId-to-applicationData map | +| [client\_config\_bucket](#output\_client\_config\_bucket) | S3 bucket name for client subscription configuration | +| [client\_config\_bucket\_arn](#output\_client\_config\_bucket\_arn) | S3 bucket ARN for client subscription configuration | | [deployment](#output\_deployment) | Deployment details used for post-deployment scripts | +| [elasticache\_cache\_name](#output\_elasticache\_cache\_name) | ElastiCache cache name for SigV4 token presigning | +| [elasticache\_endpoint](#output\_elasticache\_endpoint) | ElastiCache Serverless endpoint address | +| [elasticache\_iam\_username](#output\_elasticache\_iam\_username) | IAM username for ElastiCache authentication | +| [event\_bus\_name](#output\_event\_bus\_name) | EventBridge bus name for client subscription rules | +| [kms\_key\_arn](#output\_kms\_key\_arn) | KMS key ARN for encryption at rest | +| [lambda\_s3\_bucket](#output\_lambda\_s3\_bucket) | S3 bucket for Lambda function artefacts | +| [lambda\_security\_group\_id](#output\_lambda\_security\_group\_id) | Security group ID for per-client HTTPS Client Lambda functions | +| [log\_destination\_arn](#output\_log\_destination\_arn) | Firehose destination ARN for log forwarding | +| [log\_subscription\_role\_arn](#output\_log\_subscription\_role\_arn) | IAM role 
ARN for CloudWatch log subscription | +| [mock\_server\_spki\_hash](#output\_mock\_server\_spki\_hash) | Base64 SHA-256 SPKI hash of the mock server certificate | +| [mock\_webhook\_alb\_dns\_name](#output\_mock\_webhook\_alb\_dns\_name) | DNS name of the mock webhook ALB (dev/test only) | +| [mock\_webhook\_api\_key](#output\_mock\_webhook\_api\_key) | API key for the mock webhook endpoint (dev/test only) | | [mock\_webhook\_lambda\_log\_group\_name](#output\_mock\_webhook\_lambda\_log\_group\_name) | CloudWatch log group name for mock webhook lambda (for integration test queries) | +| [mtls\_cert\_secret\_arn](#output\_mtls\_cert\_secret\_arn) | Secrets Manager ARN for the shared mTLS client certificate | +| [mtls\_test\_ca\_s3\_key](#output\_mtls\_test\_ca\_s3\_key) | S3 key for dev CA certificate PEM bundle | +| [mtls\_test\_cert\_s3\_bucket](#output\_mtls\_test\_cert\_s3\_bucket) | S3 bucket for dev mTLS test certificates | +| [mtls\_test\_cert\_s3\_key](#output\_mtls\_test\_cert\_s3\_key) | S3 key for dev mTLS test certificate bundle | +| [vpc\_subnet\_ids](#output\_vpc\_subnet\_ids) | VPC subnet IDs for Lambda execution | diff --git a/infrastructure/terraform/components/callbacks/locals.tf b/infrastructure/terraform/components/callbacks/locals.tf index 68129a5b..ef474000 100644 --- a/infrastructure/terraform/components/callbacks/locals.tf +++ b/infrastructure/terraform/components/callbacks/locals.tf @@ -5,65 +5,10 @@ locals { root_domain_name = "${var.environment}.${local.acct.route53_zone_names["client-callbacks"]}" # e.g. [main|dev|abxy0].smsnudge.[dev|nonprod|prod].nhsnotify.national.nhs.uk root_domain_id = local.acct.route53_zone_ids["client-callbacks"] - clients_dir_path = "${path.module}/../../modules/clients" - - config_clients = merge([ - for filename in fileset(local.clients_dir_path, "*.json") : { - (replace(filename, ".json", "")) = jsondecode(file("${local.clients_dir_path}/${filename}")) - } - ]...) 
- # SPKI hash of the mock webhook server certificate for cert-pinning enrichment. # Computed via external data source because Terraform cannot SHA-256 hash raw binary (DER) data natively. mock_server_spki_hash = var.deploy_mock_clients ? data.external.mock_server_spki_hash[0].result.hash : "" - # When deploying mock clients, replace sentinel placeholder values with the mock webhook URL and API key. - # Only used for S3 object content — must not be used as a for_each source (contains apply-time values). - enriched_mock_config_clients = var.deploy_mock_clients ? { - for client_id, client in local.config_clients : - client_id => merge(client, { - targets = [ - for target in try(client.targets, []) : - merge(target, { - invocationEndpoint = "https://${aws_lb.mock_webhook_mtls[0].dns_name}/${target.targetId}" - apiKey = merge(target.apiKey, { headerValue = random_password.mock_webhook_api_key[0].result }) - delivery = merge(try(target.delivery, {}), { - mtls = merge(try(target.delivery.mtls, {}), { - certPinning = merge(try(target.delivery.mtls.certPinning, {}), try(target.delivery.mtls.certPinning.enabled, false) ? { - spkiHash = local.mock_server_spki_hash - } : {}) - }) - }) - }) - ] - }) - } : local.config_clients - - - client_subscriptions = { - for client_id, data in local.config_clients : - client_id => { - for subscription in try(data.subscriptions, []) : - subscription.subscriptionId => { - subscription_id = subscription.subscriptionId - target_ids = try(subscription.targetIds, []) - } - } - } - - client_subscription_targets = { - for client_id, data in local.config_clients : - client_id => merge([ - for subscription in try(data.subscriptions, []) : { - for target_id in try(subscription.targetIds, []) : - "${subscription.subscriptionId}-${target_id}" => { - subscription_id = subscription.subscriptionId - target_id = target_id - } - } - ]...) 
- } - applications_map_parameter_name = coalesce(var.applications_map_parameter_name, "/${var.project}/${var.environment}/${var.component}/applications-map") client_config_bucket_arn = "arn:aws:s3:::${var.project}-${var.aws_account_id}-${var.region}-${var.environment}-${var.component}-subscription-config" diff --git a/infrastructure/terraform/components/callbacks/module_client_delivery.tf b/infrastructure/terraform/components/callbacks/module_client_delivery.tf deleted file mode 100644 index 5122606e..00000000 --- a/infrastructure/terraform/components/callbacks/module_client_delivery.tf +++ /dev/null @@ -1,48 +0,0 @@ -module "client_delivery" { - source = "../../modules/client-delivery" - for_each = local.config_clients - - project = var.project - aws_account_id = var.aws_account_id - region = var.region - component = var.component - environment = var.environment - group = var.group - - client_id = each.key - client_bus_name = aws_cloudwatch_event_bus.main.name - kms_key_arn = module.kms.key_arn - - subscriptions = local.client_subscriptions[each.key] - subscription_targets = local.client_subscription_targets[each.key] - - client_config_bucket = module.client_config_bucket.bucket - client_config_bucket_arn = module.client_config_bucket.arn - - applications_map_parameter_name = local.applications_map_parameter_name - - lambda_s3_bucket = local.acct.s3_buckets["lambda_function_artefacts"]["id"] - lambda_code_base_path = local.aws_lambda_functions_dir_path - - force_lambda_code_deploy = var.force_lambda_code_deploy - log_level = var.log_level - log_retention_in_days = var.log_retention_in_days - enable_xray_tracing = var.enable_xray_tracing - - log_destination_arn = local.log_destination_arn - log_subscription_role_arn = local.acct.log_subscription_role_arn - - elasticache_endpoint = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address - elasticache_cache_name = aws_elasticache_serverless_cache.delivery_state.name - elasticache_iam_username = 
"${var.project}-${var.environment}-${var.component}-elasticache-user" - - mtls_cert_secret_arn = var.mtls_cert_secret_arn - mtls_test_cert_s3_bucket = var.deploy_mock_clients ? module.mtls_test_certs_bucket[0].bucket : "" - mtls_test_cert_s3_key = local.mtls_test_cert_s3_key # gitleaks:allow - mtls_test_ca_s3_key = local.mtls_test_ca_s3_key # gitleaks:allow - - token_bucket_burst_capacity = var.token_bucket_burst_capacity - - vpc_subnet_ids = try(local.acct.private_subnets[local.bc_name], []) - lambda_security_group_id = aws_security_group.https_client_lambda.id -} diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf index 7a77c40c..6fee9bf7 100644 --- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf +++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf @@ -39,7 +39,8 @@ module "perf_runner_lambda" { ENVIRONMENT = var.environment INBOUND_QUEUE_URL = module.sqs_inbound_event.sqs_queue_url TRANSFORM_FILTER_LOG_GROUP = module.client_transform_filter_lambda.cloudwatch_log_group_name - DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${local.csi}-https-client-" + DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${var.project}-${var.environment}-callbacks-clients-https-client-" + DELIVERY_QUEUE_URL_PREFIX = "https://sqs.${var.region}.amazonaws.com/${var.aws_account_id}/${var.project}-${var.environment}-callbacks-clients-" MOCK_WEBHOOK_LOG_GROUP = var.deploy_mock_clients ? 
module.mock_webhook_lambda[0].cloudwatch_log_group_name : "" ELASTICACHE_ENDPOINT = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address ELASTICACHE_CACHE_NAME = aws_elasticache_serverless_cache.delivery_state.name @@ -94,8 +95,8 @@ data "aws_iam_policy_document" "perf_runner_lambda" { resources = [ module.sqs_inbound_event.sqs_queue_arn, "${module.sqs_inbound_event.sqs_queue_arn}-dlq", - "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-queue", - "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-dlq-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-clients-*-delivery-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-clients-*-delivery-dlq-queue", ] } @@ -111,7 +112,7 @@ data "aws_iam_policy_document" "perf_runner_lambda" { resources = concat( [ "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*", + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${var.project}-${var.environment}-callbacks-clients-https-client-*", ], var.deploy_mock_clients ? [ "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.mock_webhook_lambda[0].cloudwatch_log_group_name}:*", diff --git a/infrastructure/terraform/components/callbacks/outputs.tf b/infrastructure/terraform/components/callbacks/outputs.tf index 1ca00df8..9d2d8a4c 100644 --- a/infrastructure/terraform/components/callbacks/outputs.tf +++ b/infrastructure/terraform/components/callbacks/outputs.tf @@ -22,3 +22,108 @@ output "mock_webhook_lambda_log_group_name" { description = "CloudWatch log group name for mock webhook lambda (for integration test queries)" value = var.deploy_mock_clients ? 
module.mock_webhook_lambda[0].cloudwatch_log_group_name : null } + +## +# Shared outputs consumed by the callbacks-clients component via terraform_remote_state. +## + +output "kms_key_arn" { + description = "KMS key ARN for encryption at rest" + value = module.kms.key_arn +} + +output "event_bus_name" { + description = "EventBridge bus name for client subscription rules" + value = aws_cloudwatch_event_bus.main.name +} + +output "client_config_bucket" { + description = "S3 bucket name for client subscription configuration" + value = module.client_config_bucket.bucket +} + +output "client_config_bucket_arn" { + description = "S3 bucket ARN for client subscription configuration" + value = module.client_config_bucket.arn +} + +output "applications_map_parameter_name" { + description = "SSM Parameter Store path for the clientId-to-applicationData map" + value = local.applications_map_parameter_name +} + +output "lambda_s3_bucket" { + description = "S3 bucket for Lambda function artefacts" + value = local.acct.s3_buckets["lambda_function_artefacts"]["id"] +} + +output "log_destination_arn" { + description = "Firehose destination ARN for log forwarding" + value = local.log_destination_arn +} + +output "log_subscription_role_arn" { + description = "IAM role ARN for CloudWatch log subscription" + value = local.acct.log_subscription_role_arn +} + +output "elasticache_endpoint" { + description = "ElastiCache Serverless endpoint address" + value = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address +} + +output "elasticache_cache_name" { + description = "ElastiCache cache name for SigV4 token presigning" + value = aws_elasticache_serverless_cache.delivery_state.name +} + +output "elasticache_iam_username" { + description = "IAM username for ElastiCache authentication" + value = "${var.project}-${var.environment}-${var.component}-elasticache-user" +} + +output "lambda_security_group_id" { + description = "Security group ID for per-client HTTPS Client Lambda 
functions" + value = aws_security_group.https_client_lambda.id +} + +output "vpc_subnet_ids" { + description = "VPC subnet IDs for Lambda execution" + value = try(local.acct.private_subnets[local.bc_name], []) +} + +output "mtls_cert_secret_arn" { + description = "Secrets Manager ARN for the shared mTLS client certificate" + value = var.mtls_cert_secret_arn +} + +output "mtls_test_cert_s3_bucket" { + description = "S3 bucket for dev mTLS test certificates" + value = var.deploy_mock_clients ? module.mtls_test_certs_bucket[0].bucket : "" +} + +output "mtls_test_cert_s3_key" { + description = "S3 key for dev mTLS test certificate bundle" + value = local.mtls_test_cert_s3_key +} + +output "mtls_test_ca_s3_key" { + description = "S3 key for dev CA certificate PEM bundle" + value = local.mtls_test_ca_s3_key +} + +output "mock_webhook_alb_dns_name" { + description = "DNS name of the mock webhook ALB (dev/test only)" + value = var.deploy_mock_clients ? aws_lb.mock_webhook_mtls[0].dns_name : "" +} + +output "mock_webhook_api_key" { + description = "API key for the mock webhook endpoint (dev/test only)" + value = var.deploy_mock_clients ? random_password.mock_webhook_api_key[0].result : "" + sensitive = true +} + +output "mock_server_spki_hash" { + description = "Base64 SHA-256 SPKI hash of the mock server certificate" + value = local.mock_server_spki_hash +} diff --git a/infrastructure/terraform/components/callbacks/pre.sh b/infrastructure/terraform/components/callbacks/pre.sh index cac3b745..64cefdfa 100755 --- a/infrastructure/terraform/components/callbacks/pre.sh +++ b/infrastructure/terraform/components/callbacks/pre.sh @@ -1,41 +1,12 @@ # This script is run before the Terraform apply command. -# It ensures dependencies are installed, generates local client config files -# for terraform from S3-held subscriptions, and builds lambda workspaces. +# It ensures dependencies are installed and builds lambda workspaces. 
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck source=_paths.sh source "${script_dir}/_paths.sh" -# Resolve deploy_mock_clients and deploy_perf_runner from tfvars; base_path/group/region/environment are in scope from terraform.sh -deploy_mock_clients="false" -deploy_perf_runner="false" -for _tfvar_file in \ - "${base_path}/etc/group_${group}.tfvars" \ - "${base_path}/etc/env_${region}_${environment}.tfvars"; do - if [ -f "${_tfvar_file}" ]; then - _val=$(grep -E '^\s*deploy_mock_clients\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') - [ -n "${_val}" ] && deploy_mock_clients="${_val}" - _val=$(grep -E '^\s*deploy_perf_runner\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') - [ -n "${_val}" ] && deploy_perf_runner="${_val}" - fi -done -echo "deploy_mock_clients resolved to: ${deploy_mock_clients}" -echo "deploy_perf_runner resolved to: ${deploy_perf_runner}" - pnpm install --frozen-lockfile pnpm run generate-dependencies -"${script_dir}/sync-client-config.sh" - -if [ "${deploy_mock_clients}" == "true" ]; then - cp "${bounded_context_root}/tests/integration/fixtures/subscriptions/"*.json "${clients_dir}/" - echo "Copied mock client subscription config fixtures into clients dir" -fi - -if [ "${deploy_perf_runner}" == "true" ]; then - cp "${bounded_context_root}/tests/performance/fixtures/subscriptions/"*.json "${clients_dir}/" - echo "Copied perf client subscription config fixtures into clients dir" -fi - pnpm run --recursive --if-present lambda-build diff --git a/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf b/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf index 9943affd..f3ab9b1e 100644 --- a/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf +++ b/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf @@ -1,16 +1,3 @@ -resource "aws_s3_object" "mock_client_config" { - for_each = var.deploy_mock_clients ? 
toset(keys(local.config_clients)) : toset([]) - - bucket = module.client_config_bucket.id - key = "client_subscriptions/${local.config_clients[each.key].clientId}.json" - content = jsonencode(local.enriched_mock_config_clients[each.key]) - - kms_key_id = module.kms.key_arn - server_side_encryption = "aws:kms" - - content_type = "application/json" -} - module "client_config_bucket" { source = "https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-s3bucket.zip" diff --git a/infrastructure/terraform/components/callbacks/variables.tf b/infrastructure/terraform/components/callbacks/variables.tf index aef32373..9c71492d 100644 --- a/infrastructure/terraform/components/callbacks/variables.tf +++ b/infrastructure/terraform/components/callbacks/variables.tf @@ -195,9 +195,3 @@ variable "elasticache_data_storage_maximum_gb" { description = "Maximum data storage in GB for the ElastiCache Serverless delivery state cache" default = 1 } - -variable "token_bucket_burst_capacity" { - type = number - description = "Token bucket burst capacity used by the rate limiter" - default = 2250 -} diff --git a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts index 3c33bfd6..dbabd465 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts @@ -42,7 +42,9 @@ beforeEach(() => { process.env.TRANSFORM_FILTER_LOG_GROUP = "/aws/lambda/nhs-dev-callbacks-client-transform-filter"; process.env.DELIVERY_LOG_GROUP_PREFIX = - "/aws/lambda/nhs-dev-callbacks-https-client-"; + "/aws/lambda/nhs-dev-callbacks-clients-https-client-"; + process.env.DELIVERY_QUEUE_URL_PREFIX = + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-"; process.env.MOCK_WEBHOOK_LOG_GROUP = "/aws/lambda/nhs-dev-callbacks-mock-webhook"; process.env.ELASTICACHE_ENDPOINT = "cache.example.invalid"; @@ -60,7 +62,10 @@ describe("handler", () => { 
expect.objectContaining({ queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", - deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-https-client-", + deliveryLogGroupPrefix: + "/aws/lambda/nhs-dev-callbacks-clients-https-client-", + deliveryQueueUrlPrefix: + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-", mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", }), DEFAULT_SCENARIO, diff --git a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts index 60347ef9..5cad33e4 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts @@ -24,17 +24,24 @@ const scenario: Scenario = { const inboundQueueUrl = "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue"; +const deliveryQueueUrlPrefix = + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-"; + describe("deriveQueueUrls", () => { - it("derives all queue URLs from the inbound queue URL and scenario", () => { - const urls = deriveQueueUrls(inboundQueueUrl, scenario); + it("derives all queue URLs from the inbound queue URL, scenario and delivery prefix", () => { + const urls = deriveQueueUrls( + inboundQueueUrl, + scenario, + deliveryQueueUrlPrefix, + ); expect(urls).toEqual([ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-queue", - 
"https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-2-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-2-delivery-dlq-queue", ]); }); @@ -57,13 +64,30 @@ describe("deriveQueueUrls", () => { ], }; - const urls = deriveQueueUrls(inboundQueueUrl, duplicateScenario); + const urls = deriveQueueUrls( + inboundQueueUrl, + duplicateScenario, + deliveryQueueUrlPrefix, + ); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-dlq-queue", + ]); + }); + + it("falls back to inbound URL prefix when no delivery prefix is given", () => { + const urls = deriveQueueUrls(inboundQueueUrl, scenario); expect(urls).toEqual([ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue", "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-dlq-queue", ]); }); }); diff --git 
a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts index 46a0928d..5551f7dc 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts @@ -98,7 +98,9 @@ const deps: RunnerDeps = { cloudWatchClient: {} as CloudWatchLogsClient, queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", - deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-https-client-", + deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-clients-https-client-", + deliveryQueueUrlPrefix: + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-", }; beforeEach(() => { @@ -353,8 +355,8 @@ describe("runPerformanceTest", () => { expect(mockQueryDeliveryMetricsSnapshot).toHaveBeenCalledWith( deps.cloudWatchClient, expect.arrayContaining([ - "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-1", - "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-2", + "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-1", + "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-2", ]), expect.any(Number), expect.any(Number), @@ -480,13 +482,13 @@ describe("runPerformanceTest", () => { expect(mockQueryPerClientRateTimeline).toHaveBeenCalledTimes(2); expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( deps.cloudWatchClient, - "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-1", + "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-1", expect.any(Number), expect.any(Number), ); expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( deps.cloudWatchClient, - "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-2", + "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-2", expect.any(Number), expect.any(Number), ); @@ -551,7 +553,11 @@ describe("runPerformanceTest", () => { await runPerformanceTest(deps, scenario, "test-purge", 
immediateSleep); - expect(mockDeriveQueueUrls).toHaveBeenCalledWith(deps.queueUrl, scenario); + expect(mockDeriveQueueUrls).toHaveBeenCalledWith( + deps.queueUrl, + scenario, + deps.deliveryQueueUrlPrefix, + ); expect(mockPurgeQueues).toHaveBeenCalledTimes(2); }); diff --git a/lambdas/perf-runner-lambda/src/index.ts b/lambdas/perf-runner-lambda/src/index.ts index 5974627b..f201f1ea 100644 --- a/lambdas/perf-runner-lambda/src/index.ts +++ b/lambdas/perf-runner-lambda/src/index.ts @@ -20,6 +20,7 @@ export async function handler( const queueUrl = process.env.INBOUND_QUEUE_URL; const logGroupName = process.env.TRANSFORM_FILTER_LOG_GROUP; const deliveryLogGroupPrefix = process.env.DELIVERY_LOG_GROUP_PREFIX; + const deliveryQueueUrlPrefix = process.env.DELIVERY_QUEUE_URL_PREFIX; const mockWebhookLogGroup = process.env.MOCK_WEBHOOK_LOG_GROUP; const elasticacheEndpoint = process.env.ELASTICACHE_ENDPOINT; const elasticacheCacheName = process.env.ELASTICACHE_CACHE_NAME; @@ -58,6 +59,7 @@ export async function handler( queueUrl, logGroupName, deliveryLogGroupPrefix, + deliveryQueueUrlPrefix, mockWebhookLogGroup, }, scenario, diff --git a/lambdas/perf-runner-lambda/src/purge.ts b/lambdas/perf-runner-lambda/src/purge.ts index 5743e9d2..bd51097b 100644 --- a/lambdas/perf-runner-lambda/src/purge.ts +++ b/lambdas/perf-runner-lambda/src/purge.ts @@ -4,17 +4,19 @@ import type { Scenario } from "types"; export function deriveQueueUrls( inboundQueueUrl: string, scenario: Scenario, + deliveryQueueUrlPrefix?: string, ): string[] { // eslint-disable-next-line sonarjs/null-dereference -- String.replace always returns a string - const baseUrl = inboundQueueUrl.replace(/inbound-event-queue$/, ""); + const inboundBaseUrl = inboundQueueUrl.replace(/inbound-event-queue$/, ""); + const deliveryBaseUrl = deliveryQueueUrlPrefix ?? 
inboundBaseUrl; const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))]; return [ inboundQueueUrl, - `${baseUrl}inbound-event-dlq-queue`, + `${inboundBaseUrl}inbound-event-dlq-queue`, ...clientIds.flatMap((id) => [ - `${baseUrl}${id}-delivery-queue`, - `${baseUrl}${id}-delivery-dlq-queue`, + `${deliveryBaseUrl}${id}-delivery-queue`, + `${deliveryBaseUrl}${id}-delivery-dlq-queue`, ]), ]; } diff --git a/lambdas/perf-runner-lambda/src/runner.ts b/lambdas/perf-runner-lambda/src/runner.ts index 321abc45..0ae6a81c 100644 --- a/lambdas/perf-runner-lambda/src/runner.ts +++ b/lambdas/perf-runner-lambda/src/runner.ts @@ -107,7 +107,11 @@ export async function runPerformanceTest( const testStartMs = Date.now(); - const queueUrls = deriveQueueUrls(deps.queueUrl, scenario); + const queueUrls = deriveQueueUrls( + deps.queueUrl, + scenario, + deps.deliveryQueueUrlPrefix, + ); await purgeQueues(deps.sqsClient, queueUrls); if (elastiCacheDeps) { await flushElastiCache(elastiCacheDeps); diff --git a/lambdas/perf-runner-lambda/src/types.ts b/lambdas/perf-runner-lambda/src/types.ts index 24df2a50..b4ece4ae 100644 --- a/lambdas/perf-runner-lambda/src/types.ts +++ b/lambdas/perf-runner-lambda/src/types.ts @@ -108,6 +108,7 @@ export type RunnerDeps = { queueUrl: string; logGroupName: string; deliveryLogGroupPrefix?: string; + deliveryQueueUrlPrefix?: string; mockWebhookLogGroup?: string; }; diff --git a/scripts/tests/integration-debug.sh b/scripts/tests/integration-debug.sh index a4ebbd63..34f1e8d1 100755 --- a/scripts/tests/integration-debug.sh +++ b/scripts/tests/integration-debug.sh @@ -59,6 +59,7 @@ fi ACCOUNT_ID="$(aws sts get-caller-identity --profile "$AWS_PROFILE" --query Account --output text)" PREFIX="nhs-${ENVIRONMENT}-callbacks" +CLIENT_PREFIX="nhs-${ENVIRONMENT}-callbacks-clients" PIPE_NAME="${PREFIX}-main" print_section() { @@ -97,8 +98,8 @@ show_queue_counts() { action_queue_status() { require_client_id - show_queue_counts "Client Delivery Queue - Message 
Counts" "${PREFIX}-${CLIENT_ID}-delivery-queue" - show_queue_counts "Client Delivery DLQ - Message Counts" "${PREFIX}-${CLIENT_ID}-delivery-dlq-queue" + show_queue_counts "Client Delivery Queue - Message Counts" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-queue" + show_queue_counts "Client Delivery DLQ - Message Counts" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-dlq-queue" show_queue_counts "Inbound Event Queue - Message Counts" "${PREFIX}-inbound-event-queue" show_queue_counts "Inbound Event DLQ - Message Counts" "${PREFIX}-inbound-event-dlq" } @@ -126,8 +127,8 @@ peek_queue_message() { action_queue_peek() { require_client_id - peek_queue_message "Client Delivery Queue - Message Peek" "${PREFIX}-${CLIENT_ID}-delivery-queue" - peek_queue_message "Client Delivery DLQ - Message Peek" "${PREFIX}-${CLIENT_ID}-delivery-dlq-queue" + peek_queue_message "Client Delivery Queue - Message Peek" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-queue" + peek_queue_message "Client Delivery DLQ - Message Peek" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-dlq-queue" peek_queue_message "Inbound Event Queue - Message Peek" "${PREFIX}-inbound-event-queue" peek_queue_message "Inbound Event DLQ - Message Peek" "${PREFIX}-inbound-event-dlq" } @@ -162,7 +163,7 @@ action_tail_https_client() { print_section "HTTPS Client Lambda Logs" aws logs tail \ - "/aws/lambda/${PREFIX}-https-client-${CLIENT_ID}" \ + "/aws/lambda/${CLIENT_PREFIX}-https-client-${CLIENT_ID}" \ --region "$REGION" \ --profile "$AWS_PROFILE" \ --since 30m \ From 3d7a4026ff2e05fb2f2f2ef8886863ad114420a6 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Wed, 29 Apr 2026 10:12:22 +0100 Subject: [PATCH 7/9] CCM-16073 - Fixed resource name lengths --- .github/workflows/cicd-1-pull-request.yaml | 4 +- .github/workflows/pr_closed.yml | 2 +- .github/workflows/pr_destroy_dynamic_env.yml | 4 +- .github/workflows/release_created.yml | 2 +- .../components/callbacks-clients/README.md | 32 ++--------- .../callbacks/module_perf_runner_lambda.tf | 10 ++-- 
.../terraform/components/callbacks/outputs.tf | 2 +- .../{callbacks-clients => cbc}/.tool-versions | 0 .../terraform/components/cbc/README.md | 43 ++++++++++++++ .../{callbacks-clients => cbc}/_paths.sh | 0 .../terraform/components/cbc/locals.tf | 57 +++++++++++++++++++ .../locals_remote_state.tf | 0 .../locals_tfscaffold.tf | 0 .../module_client_delivery.tf | 0 .../{callbacks-clients => cbc}/outputs.tf | 0 .../{callbacks-clients => cbc}/pre.sh | 0 .../provider_aws.tf | 0 .../s3_object_client_config.tf | 0 .../ssm_parameter_applications_map.tf | 0 .../sync-client-config.sh | 0 .../{callbacks-clients => cbc}/variables.tf | 2 +- .../{callbacks-clients => cbc}/versions.tf | 0 .../src/__tests__/index.test.ts | 9 ++- .../src/__tests__/purge.test.ts | 14 ++--- .../src/__tests__/runner.test.ts | 12 ++-- scripts/tests/integration-debug.sh | 2 +- 26 files changed, 135 insertions(+), 60 deletions(-) rename infrastructure/terraform/components/{callbacks-clients => cbc}/.tool-versions (100%) create mode 100644 infrastructure/terraform/components/cbc/README.md rename infrastructure/terraform/components/{callbacks-clients => cbc}/_paths.sh (100%) create mode 100644 infrastructure/terraform/components/cbc/locals.tf rename infrastructure/terraform/components/{callbacks-clients => cbc}/locals_remote_state.tf (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/locals_tfscaffold.tf (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/module_client_delivery.tf (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/outputs.tf (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/pre.sh (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/provider_aws.tf (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/s3_object_client_config.tf (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/ssm_parameter_applications_map.tf 
(100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/sync-client-config.sh (100%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/variables.tf (98%) rename infrastructure/terraform/components/{callbacks-clients => cbc}/versions.tf (100%) diff --git a/.github/workflows/cicd-1-pull-request.yaml b/.github/workflows/cicd-1-pull-request.yaml index d293f0bd..3697908c 100644 --- a/.github/workflows/cicd-1-pull-request.yaml +++ b/.github/workflows/cicd-1-pull-request.yaml @@ -183,7 +183,7 @@ jobs: APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Trigger dynamic environment creation (callbacks-clients) + - name: Trigger dynamic environment creation (cbc) shell: bash run: | .github/scripts/dispatch_internal_repo_workflow.sh \ @@ -191,7 +191,7 @@ jobs: --releaseVersion "${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" \ --targetWorkflow "dispatch-deploy-dynamic-env.yaml" \ --targetEnvironment "pr${{ needs.metadata.outputs.pr_number }}" \ - --targetComponent "callbacks-clients" \ + --targetComponent "cbc" \ --targetAccountGroup "nhs-notify-client-callbacks-dev" \ --terraformAction "apply" \ --overrideProjectName "nhs" \ diff --git a/.github/workflows/pr_closed.yml b/.github/workflows/pr_closed.yml index bdcfc4b4..a0b2c396 100644 --- a/.github/workflows/pr_closed.yml +++ b/.github/workflows/pr_closed.yml @@ -46,7 +46,7 @@ jobs: strategy: max-parallel: 1 matrix: - component: [callbacks, callbacks-clients] + component: [callbacks, cbc] steps: - name: Checkout repository diff --git a/.github/workflows/pr_destroy_dynamic_env.yml b/.github/workflows/pr_destroy_dynamic_env.yml index 7aeb3d1f..350d5316 100644 --- a/.github/workflows/pr_destroy_dynamic_env.yml +++ b/.github/workflows/pr_destroy_dynamic_env.yml @@ -19,7 +19,7 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - name: Trigger dynamic environment destroy 
(callbacks-clients) + - name: Trigger dynamic environment destroy (cbc) env: APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} APP_CLIENT_ID: ${{ secrets.APP_CLIENT_ID }} @@ -30,7 +30,7 @@ jobs: --releaseVersion "${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" \ --targetWorkflow "dispatch-deploy-dynamic-env.yaml" \ --targetEnvironment "pr${{ github.event.number }}" \ - --targetComponent "callbacks-clients" \ + --targetComponent "cbc" \ --targetAccountGroup "nhs-notify-client-callbacks-dev" \ --terraformAction "destroy" \ --overrideProjectName "nhs" \ diff --git a/.github/workflows/release_created.yml b/.github/workflows/release_created.yml index eb18f897..d29c9a66 100644 --- a/.github/workflows/release_created.yml +++ b/.github/workflows/release_created.yml @@ -22,7 +22,7 @@ jobs: strategy: max-parallel: 1 matrix: - component: [callbacks, callbacks-clients] + component: [callbacks, cbc] steps: - name: Checkout repository diff --git a/infrastructure/terraform/components/callbacks-clients/README.md b/infrastructure/terraform/components/callbacks-clients/README.md index 2f5580bb..df8c1f5c 100644 --- a/infrastructure/terraform/components/callbacks-clients/README.md +++ b/infrastructure/terraform/components/callbacks-clients/README.md @@ -4,40 +4,16 @@ ## Requirements -| Name | Version | -|------|---------| -| [terraform](#requirement\_terraform) | >= 1.10.1 | -| [aws](#requirement\_aws) | 6.13 | -| [random](#requirement\_random) | ~> 3.0 | +No requirements. 
## Inputs -| Name | Description | Type | Default | Required | -|------|-------------|------|---------|:--------:| -| [aws\_account\_id](#input\_aws\_account\_id) | The AWS Account ID (numeric) | `string` | n/a | yes | -| [component](#input\_component) | The variable encapsulating the name of this component | `string` | `"callbacks-clients"` | no | -| [default\_tags](#input\_default\_tags) | A map of default tags to apply to all taggable resources within the component | `map(string)` | `{}` | no | -| [deploy\_mock\_clients](#input\_deploy\_mock\_clients) | Flag to deploy mock client subscription config for integration testing (test/dev environments only) | `bool` | `false` | no | -| [deploy\_perf\_runner](#input\_deploy\_perf\_runner) | Flag to deploy performance test client subscription fixtures | `bool` | `false` | no | -| [enable\_xray\_tracing](#input\_enable\_xray\_tracing) | Enable AWS X-Ray active tracing for Lambda functions | `bool` | `false` | no | -| [environment](#input\_environment) | The name of the tfscaffold environment | `string` | n/a | yes | -| [force\_lambda\_code\_deploy](#input\_force\_lambda\_code\_deploy) | If the lambda package in s3 has the same commit id tag as the terraform build branch, the lambda will not update automatically. 
Set to True if making changes to Lambda code from on the same commit for example during development | `bool` | `false` | no | -| [group](#input\_group) | The group variables are being inherited from (often synonmous with account short-name) | `string` | n/a | yes | -| [log\_level](#input\_log\_level) | The log level to be used in lambda functions within the component | `string` | `"INFO"` | no | -| [log\_retention\_in\_days](#input\_log\_retention\_in\_days) | The retention period in days for the Cloudwatch Logs events to be retained, default of 0 is indefinite | `number` | `0` | no | -| [parent\_acct\_environment](#input\_parent\_acct\_environment) | Name of the environment responsible for the acct resources used, affects things like DNS zone. Useful for named dev environments | `string` | `"main"` | no | -| [project](#input\_project) | The name of the tfscaffold project | `string` | n/a | yes | -| [region](#input\_region) | The AWS Region | `string` | n/a | yes | -| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | +No inputs. ## Modules -| Name | Source | Version | -|------|--------|---------| -| [client\_delivery](#module\_client\_delivery) | ../../modules/client-delivery | n/a | +No modules. ## Outputs -| Name | Description | -|------|-------------| -| [deployment](#output\_deployment) | Deployment details used for post-deployment scripts | +No outputs. 
diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf index 6fee9bf7..4795bc8d 100644 --- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf +++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf @@ -39,8 +39,8 @@ module "perf_runner_lambda" { ENVIRONMENT = var.environment INBOUND_QUEUE_URL = module.sqs_inbound_event.sqs_queue_url TRANSFORM_FILTER_LOG_GROUP = module.client_transform_filter_lambda.cloudwatch_log_group_name - DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${var.project}-${var.environment}-callbacks-clients-https-client-" - DELIVERY_QUEUE_URL_PREFIX = "https://sqs.${var.region}.amazonaws.com/${var.aws_account_id}/${var.project}-${var.environment}-callbacks-clients-" + DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${var.project}-${var.environment}-cbc-https-client-" + DELIVERY_QUEUE_URL_PREFIX = "https://sqs.${var.region}.amazonaws.com/${var.aws_account_id}/${var.project}-${var.environment}-cbc-" MOCK_WEBHOOK_LOG_GROUP = var.deploy_mock_clients ? 
module.mock_webhook_lambda[0].cloudwatch_log_group_name : "" ELASTICACHE_ENDPOINT = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address ELASTICACHE_CACHE_NAME = aws_elasticache_serverless_cache.delivery_state.name @@ -95,8 +95,8 @@ data "aws_iam_policy_document" "perf_runner_lambda" { resources = [ module.sqs_inbound_event.sqs_queue_arn, "${module.sqs_inbound_event.sqs_queue_arn}-dlq", - "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-clients-*-delivery-queue", - "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-clients-*-delivery-dlq-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-dlq-queue", ] } @@ -112,7 +112,7 @@ data "aws_iam_policy_document" "perf_runner_lambda" { resources = concat( [ "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${var.project}-${var.environment}-callbacks-clients-https-client-*", + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${var.project}-${var.environment}-cbc-https-client-*", ], var.deploy_mock_clients ? [ "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.mock_webhook_lambda[0].cloudwatch_log_group_name}:*", diff --git a/infrastructure/terraform/components/callbacks/outputs.tf b/infrastructure/terraform/components/callbacks/outputs.tf index 9d2d8a4c..359e77d2 100644 --- a/infrastructure/terraform/components/callbacks/outputs.tf +++ b/infrastructure/terraform/components/callbacks/outputs.tf @@ -24,7 +24,7 @@ output "mock_webhook_lambda_log_group_name" { } ## -# Shared outputs consumed by the callbacks-clients component via terraform_remote_state. 
+# Shared outputs consumed by the cbc component via terraform_remote_state. ## output "kms_key_arn" { diff --git a/infrastructure/terraform/components/callbacks-clients/.tool-versions b/infrastructure/terraform/components/cbc/.tool-versions similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/.tool-versions rename to infrastructure/terraform/components/cbc/.tool-versions diff --git a/infrastructure/terraform/components/cbc/README.md b/infrastructure/terraform/components/cbc/README.md new file mode 100644 index 00000000..8c604b55 --- /dev/null +++ b/infrastructure/terraform/components/cbc/README.md @@ -0,0 +1,43 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.10.1 | +| [aws](#requirement\_aws) | 6.13 | +| [random](#requirement\_random) | ~> 3.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [aws\_account\_id](#input\_aws\_account\_id) | The AWS Account ID (numeric) | `string` | n/a | yes | +| [component](#input\_component) | The variable encapsulating the name of this component | `string` | `"cbc"` | no | +| [default\_tags](#input\_default\_tags) | A map of default tags to apply to all taggable resources within the component | `map(string)` | `{}` | no | +| [deploy\_mock\_clients](#input\_deploy\_mock\_clients) | Flag to deploy mock client subscription config for integration testing (test/dev environments only) | `bool` | `false` | no | +| [deploy\_perf\_runner](#input\_deploy\_perf\_runner) | Flag to deploy performance test client subscription fixtures | `bool` | `false` | no | +| [enable\_xray\_tracing](#input\_enable\_xray\_tracing) | Enable AWS X-Ray active tracing for Lambda functions | `bool` | `false` | no | +| [environment](#input\_environment) | The name of the tfscaffold environment | `string` | n/a | yes | +| [force\_lambda\_code\_deploy](#input\_force\_lambda\_code\_deploy) | If 
the lambda package in s3 has the same commit id tag as the terraform build branch, the lambda will not update automatically. Set to True if making changes to Lambda code from on the same commit for example during development | `bool` | `false` | no | +| [group](#input\_group) | The group variables are being inherited from (often synonmous with account short-name) | `string` | n/a | yes | +| [log\_level](#input\_log\_level) | The log level to be used in lambda functions within the component | `string` | `"INFO"` | no | +| [log\_retention\_in\_days](#input\_log\_retention\_in\_days) | The retention period in days for the Cloudwatch Logs events to be retained, default of 0 is indefinite | `number` | `0` | no | +| [parent\_acct\_environment](#input\_parent\_acct\_environment) | Name of the environment responsible for the acct resources used, affects things like DNS zone. Useful for named dev environments | `string` | `"main"` | no | +| [project](#input\_project) | The name of the tfscaffold project | `string` | n/a | yes | +| [region](#input\_region) | The AWS Region | `string` | n/a | yes | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [client\_delivery](#module\_client\_delivery) | ../../modules/client-delivery | n/a | +## Outputs + +| Name | Description | +|------|-------------| +| [deployment](#output\_deployment) | Deployment details used for post-deployment scripts | + + + diff --git a/infrastructure/terraform/components/callbacks-clients/_paths.sh b/infrastructure/terraform/components/cbc/_paths.sh similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/_paths.sh rename to infrastructure/terraform/components/cbc/_paths.sh diff --git a/infrastructure/terraform/components/cbc/locals.tf b/infrastructure/terraform/components/cbc/locals.tf new file 
mode 100644 index 00000000..ee33b2ba --- /dev/null +++ b/infrastructure/terraform/components/cbc/locals.tf @@ -0,0 +1,57 @@ +locals { + aws_lambda_functions_dir_path = "../../../../lambdas" + + clients_dir_path = "${path.module}/../../modules/clients" + + config_clients = merge([ + for filename in fileset(local.clients_dir_path, "*.json") : { + (replace(filename, ".json", "")) = jsondecode(file("${local.clients_dir_path}/${filename}")) + } + ]...) + + # When deploying mock clients, replace sentinel placeholder values with the mock webhook URL and API key. + # Only used for S3 object content — must not be used as a for_each source (contains apply-time values). + enriched_mock_config_clients = var.deploy_mock_clients ? { + for client_id, client in local.config_clients : + client_id => merge(client, { + targets = [ + for target in try(client.targets, []) : + merge(target, { + invocationEndpoint = "https://${local.callbacks.mock_webhook_alb_dns_name}/${target.targetId}" + apiKey = merge(target.apiKey, { headerValue = local.callbacks.mock_webhook_api_key }) + delivery = merge(try(target.delivery, {}), { + mtls = merge(try(target.delivery.mtls, {}), { + certPinning = merge(try(target.delivery.mtls.certPinning, {}), try(target.delivery.mtls.certPinning.enabled, false) ? 
{ + spkiHash = local.callbacks.mock_server_spki_hash + } : {}) + }) + }) + }) + ] + }) + } : local.config_clients + + client_subscriptions = { + for client_id, data in local.config_clients : + client_id => { + for subscription in try(data.subscriptions, []) : + subscription.subscriptionId => { + subscription_id = subscription.subscriptionId + target_ids = try(subscription.targetIds, []) + } + } + } + + client_subscription_targets = { + for client_id, data in local.config_clients : + client_id => merge([ + for subscription in try(data.subscriptions, []) : { + for target_id in try(subscription.targetIds, []) : + "${subscription.subscriptionId}-${target_id}" => { + subscription_id = subscription.subscriptionId + target_id = target_id + } + } + ]...) + } +} diff --git a/infrastructure/terraform/components/callbacks-clients/locals_remote_state.tf b/infrastructure/terraform/components/cbc/locals_remote_state.tf similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/locals_remote_state.tf rename to infrastructure/terraform/components/cbc/locals_remote_state.tf diff --git a/infrastructure/terraform/components/callbacks-clients/locals_tfscaffold.tf b/infrastructure/terraform/components/cbc/locals_tfscaffold.tf similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/locals_tfscaffold.tf rename to infrastructure/terraform/components/cbc/locals_tfscaffold.tf diff --git a/infrastructure/terraform/components/callbacks-clients/module_client_delivery.tf b/infrastructure/terraform/components/cbc/module_client_delivery.tf similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/module_client_delivery.tf rename to infrastructure/terraform/components/cbc/module_client_delivery.tf diff --git a/infrastructure/terraform/components/callbacks-clients/outputs.tf b/infrastructure/terraform/components/cbc/outputs.tf similarity index 100% rename from 
infrastructure/terraform/components/callbacks-clients/outputs.tf rename to infrastructure/terraform/components/cbc/outputs.tf diff --git a/infrastructure/terraform/components/callbacks-clients/pre.sh b/infrastructure/terraform/components/cbc/pre.sh similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/pre.sh rename to infrastructure/terraform/components/cbc/pre.sh diff --git a/infrastructure/terraform/components/callbacks-clients/provider_aws.tf b/infrastructure/terraform/components/cbc/provider_aws.tf similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/provider_aws.tf rename to infrastructure/terraform/components/cbc/provider_aws.tf diff --git a/infrastructure/terraform/components/callbacks-clients/s3_object_client_config.tf b/infrastructure/terraform/components/cbc/s3_object_client_config.tf similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/s3_object_client_config.tf rename to infrastructure/terraform/components/cbc/s3_object_client_config.tf diff --git a/infrastructure/terraform/components/callbacks-clients/ssm_parameter_applications_map.tf b/infrastructure/terraform/components/cbc/ssm_parameter_applications_map.tf similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/ssm_parameter_applications_map.tf rename to infrastructure/terraform/components/cbc/ssm_parameter_applications_map.tf diff --git a/infrastructure/terraform/components/callbacks-clients/sync-client-config.sh b/infrastructure/terraform/components/cbc/sync-client-config.sh similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/sync-client-config.sh rename to infrastructure/terraform/components/cbc/sync-client-config.sh diff --git a/infrastructure/terraform/components/callbacks-clients/variables.tf b/infrastructure/terraform/components/cbc/variables.tf similarity index 98% rename from 
infrastructure/terraform/components/callbacks-clients/variables.tf rename to infrastructure/terraform/components/cbc/variables.tf index 31502b14..790020b7 100644 --- a/infrastructure/terraform/components/callbacks-clients/variables.tf +++ b/infrastructure/terraform/components/cbc/variables.tf @@ -34,7 +34,7 @@ variable "group" { variable "component" { type = string description = "The variable encapsulating the name of this component" - default = "callbacks-clients" + default = "cbc" } variable "default_tags" { diff --git a/infrastructure/terraform/components/callbacks-clients/versions.tf b/infrastructure/terraform/components/cbc/versions.tf similarity index 100% rename from infrastructure/terraform/components/callbacks-clients/versions.tf rename to infrastructure/terraform/components/cbc/versions.tf diff --git a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts index dbabd465..b1edc297 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts @@ -42,9 +42,9 @@ beforeEach(() => { process.env.TRANSFORM_FILTER_LOG_GROUP = "/aws/lambda/nhs-dev-callbacks-client-transform-filter"; process.env.DELIVERY_LOG_GROUP_PREFIX = - "/aws/lambda/nhs-dev-callbacks-clients-https-client-"; + "/aws/lambda/nhs-dev-cbc-https-client-"; process.env.DELIVERY_QUEUE_URL_PREFIX = - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-"; + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-"; process.env.MOCK_WEBHOOK_LOG_GROUP = "/aws/lambda/nhs-dev-callbacks-mock-webhook"; process.env.ELASTICACHE_ENDPOINT = "cache.example.invalid"; @@ -62,10 +62,9 @@ describe("handler", () => { expect.objectContaining({ queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", - deliveryLogGroupPrefix: - "/aws/lambda/nhs-dev-callbacks-clients-https-client-", + deliveryLogGroupPrefix: 
"/aws/lambda/nhs-dev-cbc-https-client-", deliveryQueueUrlPrefix: - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-", mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", }), DEFAULT_SCENARIO, diff --git a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts index 5cad33e4..016779ea 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts @@ -25,7 +25,7 @@ const inboundQueueUrl = "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue"; const deliveryQueueUrlPrefix = - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-"; + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-"; describe("deriveQueueUrls", () => { it("derives all queue URLs from the inbound queue URL, scenario and delivery prefix", () => { @@ -38,10 +38,10 @@ describe("deriveQueueUrls", () => { expect(urls).toEqual([ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-dlq-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-2-delivery-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-2-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-2-delivery-queue", + 
"https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-2-delivery-dlq-queue", ]); }); @@ -73,8 +73,8 @@ describe("deriveQueueUrls", () => { expect(urls).toEqual([ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-queue", - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-dlq-queue", ]); }); diff --git a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts index 5551f7dc..23c720de 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts @@ -98,9 +98,9 @@ const deps: RunnerDeps = { cloudWatchClient: {} as CloudWatchLogsClient, queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", - deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-clients-https-client-", + deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-cbc-https-client-", deliveryQueueUrlPrefix: - "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-clients-", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-", }; beforeEach(() => { @@ -355,8 +355,8 @@ describe("runPerformanceTest", () => { expect(mockQueryDeliveryMetricsSnapshot).toHaveBeenCalledWith( deps.cloudWatchClient, expect.arrayContaining([ - "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-1", - "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-2", + "/aws/lambda/nhs-dev-cbc-https-client-perf-client-1", + 
"/aws/lambda/nhs-dev-cbc-https-client-perf-client-2", ]), expect.any(Number), expect.any(Number), @@ -482,13 +482,13 @@ describe("runPerformanceTest", () => { expect(mockQueryPerClientRateTimeline).toHaveBeenCalledTimes(2); expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( deps.cloudWatchClient, - "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-1", + "/aws/lambda/nhs-dev-cbc-https-client-perf-client-1", expect.any(Number), expect.any(Number), ); expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( deps.cloudWatchClient, - "/aws/lambda/nhs-dev-callbacks-clients-https-client-perf-client-2", + "/aws/lambda/nhs-dev-cbc-https-client-perf-client-2", expect.any(Number), expect.any(Number), ); diff --git a/scripts/tests/integration-debug.sh b/scripts/tests/integration-debug.sh index 34f1e8d1..f375be1a 100755 --- a/scripts/tests/integration-debug.sh +++ b/scripts/tests/integration-debug.sh @@ -59,7 +59,7 @@ fi ACCOUNT_ID="$(aws sts get-caller-identity --profile "$AWS_PROFILE" --query Account --output text)" PREFIX="nhs-${ENVIRONMENT}-callbacks" -CLIENT_PREFIX="nhs-${ENVIRONMENT}-callbacks-clients" +CLIENT_PREFIX="nhs-${ENVIRONMENT}-cbc" PIPE_NAME="${PREFIX}-main" print_section() { From a2a519bc3c32df606d0177c7f348c5e310839ef7 Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Wed, 29 Apr 2026 10:44:53 +0100 Subject: [PATCH 8/9] CCM-16073 - Fixed resource paths --- tests/integration/delivery-resilience.test.ts | 4 ++-- tests/integration/dlq-alarms.test.ts | 4 ++-- tests/integration/helpers/sqs.ts | 10 ++++++++-- tests/integration/helpers/test-context.ts | 6 ++++++ tests/integration/metrics.test.ts | 2 +- tests/test-support/helpers/deployment.ts | 11 ++++++++++- 6 files changed, 29 insertions(+), 8 deletions(-) diff --git a/tests/integration/delivery-resilience.test.ts b/tests/integration/delivery-resilience.test.ts index 8b218233..8fa510e5 100644 --- a/tests/integration/delivery-resilience.test.ts +++ 
b/tests/integration/delivery-resilience.test.ts @@ -117,7 +117,7 @@ describe("Delivery Resilience", () => { const { clientId } = getClientConfig("clientRateLimit"); dlqUrl = ctx.clientDlqUrl(clientId); deliveryUrl = ctx.clientDeliveryUrl(clientId); - httpsClientLogGroup = ctx.logGroup(`https-client-${clientId}`); + httpsClientLogGroup = ctx.clientLogGroup(`https-client-${clientId}`); await purgeQueues(ctx.sqs, [dlqUrl, deliveryUrl]); }); @@ -190,7 +190,7 @@ describe("Delivery Resilience", () => { const { clientId } = getClientConfig("clientCircuitBreaker"); dlqUrl = ctx.clientDlqUrl(clientId); deliveryUrl = ctx.clientDeliveryUrl(clientId); - httpsClientLogGroup = ctx.logGroup(`https-client-${clientId}`); + httpsClientLogGroup = ctx.clientLogGroup(`https-client-${clientId}`); await purgeQueues(ctx.sqs, [dlqUrl, deliveryUrl]); }); diff --git a/tests/integration/dlq-alarms.test.ts b/tests/integration/dlq-alarms.test.ts index c4f69fa8..ae1e1bff 100644 --- a/tests/integration/dlq-alarms.test.ts +++ b/tests/integration/dlq-alarms.test.ts @@ -13,10 +13,10 @@ import { import { buildMockClientDlqQueueUrl } from "./helpers/sqs"; function buildDlqDepthAlarmName( - { component, environment, project }: DeploymentDetails, + { clientComponent, environment, project }: DeploymentDetails, clientId: string, ): string { - return `${project}-${environment}-${component}-${clientId}-dlq-depth`; + return `${project}-${environment}-${clientComponent}-${clientId}-dlq-depth`; } function getQueueNameFromUrl(queueUrl: string): string { diff --git a/tests/integration/helpers/sqs.ts b/tests/integration/helpers/sqs.ts index 5cdcc3a9..2f35b7e5 100644 --- a/tests/integration/helpers/sqs.ts +++ b/tests/integration/helpers/sqs.ts @@ -49,14 +49,20 @@ export function buildMockClientDlqQueueUrl( deploymentDetails: DeploymentDetails, clientId: string, ): string { - return buildQueueUrl(deploymentDetails, `${clientId}-delivery-dlq`); + return buildQueueUrl( + { ...deploymentDetails, component: 
deploymentDetails.clientComponent }, + `${clientId}-delivery-dlq`, + ); } export function buildMockClientDeliveryQueueUrl( deploymentDetails: DeploymentDetails, clientId: string, ): string { - return buildQueueUrl(deploymentDetails, `${clientId}-delivery`); + return buildQueueUrl( + { ...deploymentDetails, component: deploymentDetails.clientComponent }, + `${clientId}-delivery`, + ); } export async function sendSqsEvent( diff --git a/tests/integration/helpers/test-context.ts b/tests/integration/helpers/test-context.ts index df5a31f5..c55ebdf6 100644 --- a/tests/integration/helpers/test-context.ts +++ b/tests/integration/helpers/test-context.ts @@ -25,10 +25,15 @@ export type TestContext = { clientDlqUrl(clientId: string): string; clientDeliveryUrl(clientId: string): string; logGroup(name: string): string; + clientLogGroup(name: string): string; }; export function createTestContext(): TestContext { const deployment = getDeploymentDetails(); + const clientDeployment = { + ...deployment, + component: deployment.clientComponent, + }; return { sqs: createSqsClient(deployment), @@ -43,6 +48,7 @@ export function createTestContext(): TestContext { clientDeliveryUrl: (clientId) => buildMockClientDeliveryQueueUrl(deployment, clientId), logGroup: (name) => buildLambdaLogGroupName(deployment, name), + clientLogGroup: (name) => buildLambdaLogGroupName(clientDeployment, name), }; } diff --git a/tests/integration/metrics.test.ts b/tests/integration/metrics.test.ts index 20e1dfb8..cd99588b 100644 --- a/tests/integration/metrics.test.ts +++ b/tests/integration/metrics.test.ts @@ -118,7 +118,7 @@ describe("Metrics", () => { beforeAll(() => { const { clientId } = getClientConfig("clientSingleTarget"); - httpsClientLogGroup = ctx.logGroup(`https-client-${clientId}`); + httpsClientLogGroup = ctx.clientLogGroup(`https-client-${clientId}`); }); it("should emit DeliveryAttempt, DeliverySuccess and DeliveryDurationMs on successful delivery", async () => { diff --git 
a/tests/test-support/helpers/deployment.ts b/tests/test-support/helpers/deployment.ts index 20bf1f59..9a548f70 100644 --- a/tests/test-support/helpers/deployment.ts +++ b/tests/test-support/helpers/deployment.ts @@ -3,6 +3,7 @@ export type DeploymentDetails = { environment: string; project: string; component: string; + clientComponent: string; accountId: string; }; @@ -11,6 +12,7 @@ export function getDeploymentDetails(): DeploymentDetails { const environment = process.env.ENVIRONMENT; const project = process.env.PROJECT ?? "nhs"; const component = process.env.COMPONENT ?? "callbacks"; + const clientComponent = process.env.CLIENT_COMPONENT ?? "cbc"; const accountId = process.env.AWS_ACCOUNT_ID; if (!environment) { @@ -21,7 +23,14 @@ export function getDeploymentDetails(): DeploymentDetails { throw new Error("AWS_ACCOUNT_ID environment variable must be set"); } - return { region, environment, project, component, accountId }; + return { + region, + environment, + project, + component, + clientComponent, + accountId, + }; } export function buildSubscriptionConfigBucketName({ From b7439395020065ac6a17479ab3ee994d42602f7a Mon Sep 17 00:00:00 2001 From: Rhys Cox Date: Wed, 29 Apr 2026 11:01:24 +0100 Subject: [PATCH 9/9] CCM-16073 - Fixed sqs permissions --- infrastructure/terraform/components/callbacks/module_kms.tf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/infrastructure/terraform/components/callbacks/module_kms.tf b/infrastructure/terraform/components/callbacks/module_kms.tf index 327b5641..778d0e3d 100644 --- a/infrastructure/terraform/components/callbacks/module_kms.tf +++ b/infrastructure/terraform/components/callbacks/module_kms.tf @@ -66,7 +66,8 @@ data "aws_iam_policy_document" "kms" { values = [ "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-inbound-event-queue", "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-inbound-event-dlq", - 
"arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-*-dlq-queue" #wildcard here so that DLQs for clients can also use this key + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-dlq-queue", ] } }