diff --git a/infrastructure/terraform/components/callbacks/README.md b/infrastructure/terraform/components/callbacks/README.md
index 02804698..e090abb9 100644
--- a/infrastructure/terraform/components/callbacks/README.md
+++ b/infrastructure/terraform/components/callbacks/README.md
@@ -45,6 +45,7 @@
| [s3\_enable\_force\_destroy](#input\_s3\_enable\_force\_destroy) | Whether to enable force destroy for the S3 buckets created in this module | `bool` | `false` | no |
| [sqs\_inbound\_event\_max\_receive\_count](#input\_sqs\_inbound\_event\_max\_receive\_count) | n/a | `number` | `3` | no |
| [sqs\_inbound\_event\_visibility\_timeout\_seconds](#input\_sqs\_inbound\_event\_visibility\_timeout\_seconds) | n/a | `number` | `60` | no |
+| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no |
## Modules
| Name | Source | Version |
diff --git a/infrastructure/terraform/components/callbacks/module_client_delivery.tf b/infrastructure/terraform/components/callbacks/module_client_delivery.tf
index ebc2e9e1..5122606e 100644
--- a/infrastructure/terraform/components/callbacks/module_client_delivery.tf
+++ b/infrastructure/terraform/components/callbacks/module_client_delivery.tf
@@ -41,6 +41,8 @@ module "client_delivery" {
mtls_test_cert_s3_key = local.mtls_test_cert_s3_key # gitleaks:allow
mtls_test_ca_s3_key = local.mtls_test_ca_s3_key # gitleaks:allow
+ token_bucket_burst_capacity = var.token_bucket_burst_capacity
+
vpc_subnet_ids = try(local.acct.private_subnets[local.bc_name], [])
lambda_security_group_id = aws_security_group.https_client_lambda.id
}
diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf
index 424294a8..7a77c40c 100644
--- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf
+++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf
@@ -40,6 +40,15 @@ module "perf_runner_lambda" {
INBOUND_QUEUE_URL = module.sqs_inbound_event.sqs_queue_url
TRANSFORM_FILTER_LOG_GROUP = module.client_transform_filter_lambda.cloudwatch_log_group_name
DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${local.csi}-https-client-"
+ MOCK_WEBHOOK_LOG_GROUP = var.deploy_mock_clients ? module.mock_webhook_lambda[0].cloudwatch_log_group_name : ""
+ ELASTICACHE_ENDPOINT = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address
+ ELASTICACHE_CACHE_NAME = aws_elasticache_serverless_cache.delivery_state.name
+ ELASTICACHE_IAM_USERNAME = "${var.project}-${var.environment}-${var.component}-elasticache-user"
+ }
+
+ vpc_config = {
+ subnet_ids = try(local.acct.private_subnets[local.bc_name], [])
+ security_group_ids = [aws_security_group.https_client_lambda.id]
}
}
@@ -74,6 +83,22 @@ data "aws_iam_policy_document" "perf_runner_lambda" {
]
}
+ statement {
+ sid = "SQSPurgeQueue"
+ effect = "Allow"
+
+ actions = [
+ "sqs:PurgeQueue",
+ ]
+
+ resources = [
+ module.sqs_inbound_event.sqs_queue_arn,
+ "${module.sqs_inbound_event.sqs_queue_arn}-dlq",
+ "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-queue",
+ "arn:aws:sqs:${var.region}:${var.aws_account_id}:${local.csi}-*-delivery-dlq-queue",
+ ]
+ }
+
statement {
sid = "CloudWatchLogsInsightsQuery"
effect = "Allow"
@@ -83,10 +108,15 @@ data "aws_iam_policy_document" "perf_runner_lambda" {
"logs:StopQuery",
]
- resources = [
- "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*",
- "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*",
- ]
+ resources = concat(
+ [
+ "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*",
+ "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*",
+ ],
+ var.deploy_mock_clients ? [
+ "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.mock_webhook_lambda[0].cloudwatch_log_group_name}:*",
+ ] : [],
+ )
}
statement {
@@ -99,4 +129,33 @@ data "aws_iam_policy_document" "perf_runner_lambda" {
resources = ["*"]
}
+
+ statement {
+ sid = "ElastiCacheConnect"
+ effect = "Allow"
+
+ actions = [
+ "elasticache:Connect",
+ ]
+
+ resources = [
+ aws_elasticache_serverless_cache.delivery_state.arn,
+ aws_elasticache_user.delivery_state_iam.arn,
+ ]
+ }
+
+ statement {
+ sid = "VPCNetworkInterfacePermissions"
+ effect = "Allow"
+
+ actions = [
+ "ec2:CreateNetworkInterface",
+ "ec2:DeleteNetworkInterface",
+ "ec2:DescribeNetworkInterfaces",
+ ]
+
+ resources = [
+ "*",
+ ]
+ }
}
diff --git a/infrastructure/terraform/components/callbacks/variables.tf b/infrastructure/terraform/components/callbacks/variables.tf
index 9c71492d..aef32373 100644
--- a/infrastructure/terraform/components/callbacks/variables.tf
+++ b/infrastructure/terraform/components/callbacks/variables.tf
@@ -195,3 +195,9 @@ variable "elasticache_data_storage_maximum_gb" {
description = "Maximum data storage in GB for the ElastiCache Serverless delivery state cache"
default = 1
}
+
+variable "token_bucket_burst_capacity" {
+ type = number
+ description = "Token bucket burst capacity used by the rate limiter"
+ default = 2250
+}
diff --git a/infrastructure/terraform/modules/client-delivery/README.md b/infrastructure/terraform/modules/client-delivery/README.md
index 0a4965e7..2036c60d 100644
--- a/infrastructure/terraform/modules/client-delivery/README.md
+++ b/infrastructure/terraform/modules/client-delivery/README.md
@@ -45,6 +45,7 @@ No requirements.
| [sqs\_visibility\_timeout\_seconds](#input\_sqs\_visibility\_timeout\_seconds) | Visibility timeout for the per-client delivery queue | `number` | `60` | no |
| [subscription\_targets](#input\_subscription\_targets) | Flattened subscription-target fanout map keyed by subscription-target composite key |
map(object({
subscription_id = string
target_id = string
})) | n/a | yes |
| [subscriptions](#input\_subscriptions) | Subscription definitions for this client, keyed by subscription\_id | map(object({
subscription_id = string
target_ids = list(string)
})) | n/a | yes |
+| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no |
| [vpc\_subnet\_ids](#input\_vpc\_subnet\_ids) | VPC subnet IDs for Lambda execution | `list(string)` | `[]` | no |
## Modules
diff --git a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf
index 1260d471..0021fb80 100644
--- a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf
+++ b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf
@@ -53,6 +53,7 @@ module "https_client_lambda" {
MTLS_TEST_CERT_S3_BUCKET = var.mtls_test_cert_s3_bucket
MTLS_TEST_CERT_S3_KEY = var.mtls_test_cert_s3_key # gitleaks:allow
QUEUE_URL = module.sqs_delivery.sqs_queue_url
+ TOKEN_BUCKET_BURST_CAPACITY = tostring(var.token_bucket_burst_capacity)
}
vpc_config = var.lambda_security_group_id != "" ? {
diff --git a/infrastructure/terraform/modules/client-delivery/variables.tf b/infrastructure/terraform/modules/client-delivery/variables.tf
index 643e163e..801ca291 100644
--- a/infrastructure/terraform/modules/client-delivery/variables.tf
+++ b/infrastructure/terraform/modules/client-delivery/variables.tf
@@ -181,6 +181,12 @@ variable "mtls_test_ca_s3_key" {
default = ""
}
+variable "token_bucket_burst_capacity" {
+ type = number
+ description = "Token bucket burst capacity used by the rate limiter"
+ default = 2250
+}
+
variable "elasticache_endpoint" {
type = string
description = "ElastiCache Serverless endpoint URL"
diff --git a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts
index 6aab4727..43aa2fb6 100644
--- a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts
+++ b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts
@@ -1,32 +1,32 @@
import admitLuaSrc from "services/admit.lua";
import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock";
-// ARGV: [now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs]
-// KEYS: [cbKey, rlKey]
-// Returns: [allowed (0|1), reason, retryAfterMs, effectiveRate]
+// ARGV: [now, capacity, targetRateLimit, cooldownMs, recoveryPeriodMs, probeRateLimit, targetBatchSize]
+// KEYS: [epKey]
+// Returns: [consumedTokens, reason, retryAfterMs, effectiveRate]
type AdmitArgs = {
now: number;
capacity: number;
- refillPerSec: number;
+ targetRateLimit: number;
cooldownMs: number;
- decayPeriodMs: number;
- cbWindowPeriodMs: number;
- cbProbeIntervalMs: number;
+ recoveryPeriodMs: number;
+ probeRateLimit: number;
+ targetBatchSize: number;
};
const defaultArgs: AdmitArgs = {
now: 1_000_000,
- capacity: 10,
- refillPerSec: 10,
- cooldownMs: 60_000,
- decayPeriodMs: 300_000,
- cbWindowPeriodMs: 60_000,
- cbProbeIntervalMs: 60_000,
+ capacity: 2250,
+ targetRateLimit: 10,
+ cooldownMs: 120_000,
+ recoveryPeriodMs: 600_000,
+ probeRateLimit: 1 / 60,
+ targetBatchSize: 1,
};
type AdmitResult = {
- allowed: number;
+ consumedTokens: number;
reason: string;
retryAfterMs: number;
effectiveRate: number;
@@ -40,20 +40,20 @@ function runAdmit(
const merged = { ...defaultArgs, ...args };
const raw = evalLua(
admitLuaSrc,
- [`cb:${targetId}`, `rl:${targetId}`],
+ [`ep:${targetId}`],
[
merged.now.toString(),
merged.capacity.toString(),
- merged.refillPerSec.toString(),
+ merged.targetRateLimit.toString(),
merged.cooldownMs.toString(),
- merged.decayPeriodMs.toString(),
- merged.cbWindowPeriodMs.toString(),
- merged.cbProbeIntervalMs.toString(),
+ merged.recoveryPeriodMs.toString(),
+ merged.probeRateLimit.toString(),
+ merged.targetBatchSize.toString(),
],
store,
) as [number, string, number, number];
return {
- allowed: raw[0],
+ consumedTokens: raw[0],
reason: raw[1],
retryAfterMs: raw[2],
effectiveRate: raw[3],
@@ -62,399 +62,453 @@ function runAdmit(
describe("admit.lua", () => {
describe("rate limiting", () => {
- it("allows the first request with full token bucket", () => {
- const store = createRedisStore();
- const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store);
-
- expect(allowed).toBe(1);
- expect(reason).toBe("allowed");
- expect(retryAfterMs).toBe(0);
- expect(effectiveRate).toBe(10);
- });
-
- it("depletes tokens on consecutive calls and rejects when empty", () => {
+ it("enters half-open probe on a fresh endpoint with no prior state", () => {
const store = createRedisStore();
+ const now = 1_000_000;
- for (let i = 0; i < 10; i++) {
- const { allowed } = runAdmit(store);
- expect(allowed).toBe(1);
- }
+ const { consumedTokens, effectiveRate, reason } = runAdmit(store, {
+ now,
+ targetRateLimit: 10,
+ });
- const { allowed, reason } = runAdmit(store);
- expect(allowed).toBe(0);
+ expect(consumedTokens).toBe(0);
expect(reason).toBe("rate_limited");
+ expect(effectiveRate).toBeCloseTo(1 / 60, 5);
});
- it("returns retryAfterMs when rate limited", () => {
+ it("does not persist circuit state on first contact", () => {
const store = createRedisStore();
+ const now = 1_000_000;
- for (let i = 0; i < 10; i++) {
- runAdmit(store);
- }
+ runAdmit(store, { now, targetRateLimit: 10 });
- const { retryAfterMs } = runAdmit(store);
- expect(retryAfterMs).toBe(1000);
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.has("is_open")).toBe(false);
+ expect(epHash.has("switched_at")).toBe(false);
});
- it("reports effective rate when rate limited", () => {
+ it("allows full rate after record-result closes the circuit", () => {
const store = createRedisStore();
+ const now = 1_000_000;
+
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", now.toString()],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", now.toString()],
+ ]),
+ );
- for (let i = 0; i < 10; i++) {
- runAdmit(store);
- }
+ const later = now + 60_000;
+ const { consumedTokens, reason } = runAdmit(store, {
+ now: later,
+ targetRateLimit: 10,
+ recoveryPeriodMs: 600_000,
+ });
- const { effectiveRate } = runAdmit(store);
- expect(effectiveRate).toBe(10);
+ expect(consumedTokens).toBeGreaterThanOrEqual(1);
+ expect(reason).toBe("allowed");
});
- it("refills tokens over time", () => {
+ it("allows a single request when bucket has tokens from refill", () => {
const store = createRedisStore();
const now = 1_000_000;
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", "0"],
+ ["switched_at", "0"],
+ ]),
+ );
- for (let i = 0; i < 10; i++) {
- runAdmit(store, { now });
- }
-
- const denied = runAdmit(store, { now });
- expect(denied.allowed).toBe(0);
+ const { consumedTokens, reason, retryAfterMs } = runAdmit(store, {
+ now,
+ targetRateLimit: 10,
+ });
- const refilled = runAdmit(store, { now: now + 1000 });
- expect(refilled.allowed).toBe(1);
+ expect(consumedTokens).toBe(1);
+ expect(reason).toBe("allowed");
+ expect(retryAfterMs).toBe(0);
});
- it("caps tokens at capacity", () => {
+ it("consumes up to targetBatchSize tokens", () => {
const store = createRedisStore();
const now = 1_000_000;
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["bucket_tokens", "5"],
+ ["bucket_refilled_at", now.toString()],
+ ["switched_at", "0"],
+ ]),
+ );
- runAdmit(store, { now, capacity: 5, refillPerSec: 100 });
-
- // Advance 10 seconds — would add 1000 tokens without cap
- runAdmit(store, { now: now + 10_000, capacity: 5, refillPerSec: 100 });
-
- const rlHash = store.get("rl:t1")!;
- // Refill capped to capacity (5), then one consumed → 4
- expect(Number(rlHash.get("tokens"))).toBe(4);
+ const { consumedTokens } = runAdmit(store, {
+ now,
+ targetBatchSize: 3,
+ });
+ expect(consumedTokens).toBe(3);
});
- it("handles zero refill rate", () => {
+ it("consumes all available when batch exceeds available tokens", () => {
const store = createRedisStore();
+ const now = 1_000_000;
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["bucket_tokens", "2"],
+ ["bucket_refilled_at", now.toString()],
+ ["switched_at", "0"],
+ ]),
+ );
- for (let i = 0; i < 10; i++) {
- runAdmit(store, { refillPerSec: 0 });
- }
-
- const { allowed, reason, retryAfterMs } = runAdmit(store, {
- refillPerSec: 0,
+ const { consumedTokens } = runAdmit(store, {
+ now,
+ targetBatchSize: 5,
});
- expect(allowed).toBe(0);
- expect(reason).toBe("rate_limited");
- expect(retryAfterMs).toBe(1000);
+ expect(consumedTokens).toBe(2);
});
- });
- describe("circuit breaker", () => {
- it("rejects when circuit is open", () => {
+ it("returns rate_limited when no tokens available", () => {
const store = createRedisStore();
const now = 1_000_000;
- const openedUntil = now + 60_000;
-
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["opened_until_ms", openedUntil.toString()],
- ["last_probe_ms", now.toString()],
+ ["is_open", "0"],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", now.toString()],
+ ["switched_at", "0"],
]),
);
- const { allowed, effectiveRate, reason } = runAdmit(store, { now });
- expect(allowed).toBe(0);
- expect(reason).toBe("circuit_open");
- expect(effectiveRate).toBe(0);
+ const { consumedTokens, reason, retryAfterMs } = runAdmit(store, { now });
+ expect(consumedTokens).toBe(0);
+ expect(reason).toBe("rate_limited");
+ expect(retryAfterMs).toBe(1000);
});
- it("returns retryAfterMs for open circuit", () => {
+ it("refills tokens over time", () => {
const store = createRedisStore();
const now = 1_000_000;
- const openedUntil = now + 30_000;
-
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["opened_until_ms", openedUntil.toString()],
- ["last_probe_ms", now.toString()],
+ ["is_open", "0"],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", now.toString()],
+ ["switched_at", "0"],
]),
);
- const { retryAfterMs } = runAdmit(store, { now });
- expect(retryAfterMs).toBe(30_000);
+ const { consumedTokens } = runAdmit(store, {
+ now: now + 1000,
+ targetRateLimit: 10,
+ });
+ expect(consumedTokens).toBe(1);
});
- it("allows probe when probe interval has elapsed", () => {
+ it("caps tokens at capacity", () => {
const store = createRedisStore();
const now = 1_000_000;
- const openedUntil = now + 120_000;
- const lastProbe = now - 61_000;
-
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["opened_until_ms", openedUntil.toString()],
- ["last_probe_ms", lastProbe.toString()],
+ ["is_open", "0"],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", "0"],
+ ["switched_at", "0"],
]),
);
- const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store, {
+ const { consumedTokens } = runAdmit(store, {
now,
- cbProbeIntervalMs: 60_000,
+ capacity: 5,
+ targetRateLimit: 100,
+ targetBatchSize: 10,
});
- expect(allowed).toBe(1);
- expect(reason).toBe("probe");
- expect(retryAfterMs).toBe(0);
- expect(effectiveRate).toBe(0);
+ expect(consumedTokens).toBe(5);
});
- it("updates last_probe_ms after allowing a probe", () => {
+ it("handles zero refill rate", () => {
const store = createRedisStore();
const now = 1_000_000;
- const openedUntil = now + 120_000;
- const lastProbe = now - 61_000;
-
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["opened_until_ms", openedUntil.toString()],
- ["last_probe_ms", lastProbe.toString()],
+ ["is_open", "0"],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", now.toString()],
+ ["switched_at", "0"],
]),
);
- runAdmit(store, { now, cbProbeIntervalMs: 60_000 });
-
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("last_probe_ms")).toBe(now.toString());
+ const { consumedTokens, reason } = runAdmit(store, {
+ now: now + 10_000,
+ targetRateLimit: 0,
+ });
+ expect(consumedTokens).toBe(0);
+ expect(reason).toBe("rate_limited");
});
- it("does not probe when interval has not elapsed", () => {
+ it("preserves fractional refill time (bucketRefilledAt += generationTime, not now)", () => {
const store = createRedisStore();
const now = 1_000_000;
- const openedUntil = now + 120_000;
- const lastProbe = now - 30_000;
-
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["opened_until_ms", openedUntil.toString()],
- ["last_probe_ms", lastProbe.toString()],
+ ["is_open", "0"],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", (now - 150).toString()],
+ ["switched_at", "0"],
]),
);
- const { allowed, reason } = runAdmit(store, {
- now,
- cbProbeIntervalMs: 60_000,
- });
- expect(allowed).toBe(0);
- expect(reason).toBe("circuit_open");
+ runAdmit(store, { now, targetRateLimit: 10 });
+
+ const epHash = store.get("ep:t1")!;
+ const refilledAt = Number(epHash.get("bucket_refilled_at"));
+ // 1 token generated at rate 10/s takes 100ms, so refilledAt = (now-150) + 100 = now - 50
+ expect(refilledAt).toBe(now - 50);
});
+ });
- it("does not probe when cbProbeIntervalMs is 0", () => {
+ describe("circuit breaker states", () => {
+ it("blocks completely when circuit is open during cooldown", () => {
const store = createRedisStore();
const now = 1_000_000;
- const openedUntil = now + 120_000;
+ const switchedAt = now - 10_000;
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["opened_until_ms", openedUntil.toString()],
- ["last_probe_ms", "0"],
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["bucket_tokens", "100"],
]),
);
- const { allowed, reason } = runAdmit(store, {
+ const { consumedTokens, reason } = runAdmit(store, {
now,
- cbProbeIntervalMs: 0,
+ cooldownMs: 120_000,
});
- expect(allowed).toBe(0);
+ expect(consumedTokens).toBe(0);
expect(reason).toBe("circuit_open");
});
- });
- describe("sliding window", () => {
- it("initialises cbWindowFrom on first call", () => {
+ it("does not consume bucket tokens when fully open", () => {
const store = createRedisStore();
const now = 1_000_000;
-
- runAdmit(store, { now });
-
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("cb_window_from")).toBe(now.toString());
- });
-
- it("rolls current window to previous when period expires", () => {
- const store = createRedisStore();
- const cbWindowPeriodMs = 60_000;
- const t0 = 1_000_000;
- const t1 = t0 + cbWindowPeriodMs + 1;
+ const switchedAt = now - 10_000;
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["cb_window_from", t0.toString()],
- ["cb_failures", "5"],
- ["cb_attempts", "10"],
- ["cb_prev_failures", "0"],
- ["cb_prev_attempts", "0"],
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["bucket_tokens", "100"],
+ ["bucket_refilled_at", now.toString()],
]),
);
- runAdmit(store, { now: t1, cbWindowPeriodMs });
+ runAdmit(store, { now, cooldownMs: 120_000 });
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("cb_prev_failures")).toBe("5");
- expect(cbHash.get("cb_prev_attempts")).toBe("10");
- expect(cbHash.get("cb_failures")).toBe("0");
- expect(cbHash.get("cb_attempts")).toBe("0");
- expect(cbHash.get("cb_window_from")).toBe(t1.toString());
+ const epHash = store.get("ep:t1")!;
+ expect(Number(epHash.get("bucket_tokens"))).toBe(100);
});
- it("clears both windows when gap exceeds two periods", () => {
+ it("returns retryAfterMs for open circuit", () => {
const store = createRedisStore();
- const cbWindowPeriodMs = 60_000;
- const t0 = 1_000_000;
- const t1 = t0 + 2 * cbWindowPeriodMs + 1;
+ const now = 1_000_000;
+ const switchedAt = now - 10_000;
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["cb_window_from", t0.toString()],
- ["cb_failures", "5"],
- ["cb_attempts", "10"],
- ["cb_prev_failures", "3"],
- ["cb_prev_attempts", "7"],
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
]),
);
- runAdmit(store, { now: t1, cbWindowPeriodMs });
-
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("cb_prev_failures")).toBe("0");
- expect(cbHash.get("cb_prev_attempts")).toBe("0");
- expect(cbHash.get("cb_failures")).toBe("0");
- expect(cbHash.get("cb_attempts")).toBe("0");
- expect(cbHash.get("cb_window_from")).toBe(t1.toString());
+ const { retryAfterMs } = runAdmit(store, { now, cooldownMs: 120_000 });
+ expect(retryAfterMs).toBe(110_000);
});
- });
- describe("decay scaling", () => {
- it("applies reduced rate during decay period", () => {
+ it("uses probeRateLimit when half-open (after cooldown)", () => {
const store = createRedisStore();
- const closedAt = 1_000_000;
- const decayPeriodMs = 300_000;
- const halfwayThrough = closedAt + decayPeriodMs / 2;
+ const now = 1_000_000;
+ const switchedAt = now - 130_000;
- store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]]));
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", (now - 60_000).toString()],
+ ]),
+ );
const { effectiveRate } = runAdmit(store, {
- now: halfwayThrough,
- refillPerSec: 10,
- decayPeriodMs,
+ now,
+ cooldownMs: 120_000,
+ probeRateLimit: 1 / 60,
});
- expect(effectiveRate).toBe(5);
+ expect(effectiveRate).toBeCloseTo(1 / 60, 5);
});
- it("uses full rate after decay period ends", () => {
+ it("zeroes residual bucket tokens when circuit is half-open", () => {
const store = createRedisStore();
- const closedAt = 1_000_000;
- const decayPeriodMs = 300_000;
- const afterDecay = closedAt + decayPeriodMs + 1;
+ const now = 1_000_000;
+ const switchedAt = now - 130_000;
- store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]]));
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["bucket_tokens", "100"],
+ ["bucket_refilled_at", (now - 60_000).toString()],
+ ]),
+ );
- const { allowed, effectiveRate } = runAdmit(store, {
- now: afterDecay,
- refillPerSec: 10,
- decayPeriodMs,
+ const { consumedTokens } = runAdmit(store, {
+ now,
+ cooldownMs: 120_000,
+ probeRateLimit: 1 / 60,
});
- expect(allowed).toBe(1);
- expect(effectiveRate).toBe(10);
+
+ expect(consumedTokens).toBe(1);
+ const epHash = store.get("ep:t1")!;
+ expect(Number(epHash.get("bucket_tokens"))).toBe(0);
});
- it("clamps minimum effective rate to 1", () => {
+ it("uses recovery ramp when closed during recovery period", () => {
const store = createRedisStore();
- const closedAt = 1_000_000;
- const decayPeriodMs = 300_000;
- const veryEarly = closedAt + 1;
+ const switchedAt = 1_000_000;
+ const recoveryPeriodMs = 600_000;
+ const now = switchedAt + recoveryPeriodMs / 2;
- store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]]));
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", switchedAt.toString()],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", "0"],
+ ]),
+ );
const { effectiveRate } = runAdmit(store, {
- now: veryEarly,
- refillPerSec: 10,
- decayPeriodMs,
+ now,
+ targetRateLimit: 10,
+ recoveryPeriodMs,
});
- expect(effectiveRate).toBeGreaterThanOrEqual(1);
+ const probeRate = defaultArgs.probeRateLimit;
+ const expectedRate = probeRate + 0.5 * (10 - probeRate);
+ expect(effectiveRate).toBeCloseTo(expectedRate, 5);
});
- it("clears openedUntil when decay period fully elapses", () => {
+ it("uses full rate when closed and past recovery period", () => {
const store = createRedisStore();
- const closedAt = 1_000_000;
- const decayPeriodMs = 300_000;
- const afterDecay = closedAt + decayPeriodMs + 1;
-
- store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]]));
+ const switchedAt = 100_000;
+ const recoveryPeriodMs = 600_000;
+ const now = switchedAt + recoveryPeriodMs + 1;
- runAdmit(store, { now: afterDecay, decayPeriodMs });
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", switchedAt.toString()],
+ ["bucket_tokens", "0"],
+ ["bucket_refilled_at", "0"],
+ ]),
+ );
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("opened_until_ms")).toBe("0");
+ const { effectiveRate } = runAdmit(store, {
+ now,
+ targetRateLimit: 10,
+ recoveryPeriodMs,
+ });
+ expect(effectiveRate).toBe(10);
});
+ });
- it("does not decay when decayPeriodMs is 0", () => {
+ describe("state persistence", () => {
+ it("persists bucket_tokens and bucket_refilled_at", () => {
const store = createRedisStore();
- const closedAt = 1_000_000;
+ const now = 1_000_000;
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["bucket_tokens", "5"],
+ ["bucket_refilled_at", now.toString()],
+ ["switched_at", "0"],
+ ]),
+ );
- store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]]));
+ runAdmit(store, { now, targetBatchSize: 2 });
- const { allowed, effectiveRate } = runAdmit(store, {
- now: closedAt + 1,
- refillPerSec: 10,
- decayPeriodMs: 0,
- });
- expect(allowed).toBe(1);
- expect(effectiveRate).toBe(10);
+ const epHash = store.get("ep:t1")!;
+ expect(Number(epHash.get("bucket_tokens"))).toBe(3);
});
- });
- describe("state persistence", () => {
- it("persists token count and last_refill_ms", () => {
+ it("does not write any fields when circuit_open early return", () => {
const store = createRedisStore();
- runAdmit(store, { now: 1_000_000, capacity: 5 });
+ runAdmit(store, {
+ now: 10_000,
+ });
- const rlHash = store.get("rl:t1")!;
- expect(rlHash.get("tokens")).toBeDefined();
- expect(rlHash.get("last_refill_ms")).toBe("1000000");
+ expect(store.has("ep:t1")).toBe(false);
});
- it("persists circuit breaker fields", () => {
+ it("does not write sampling or circuit fields on half-open path", () => {
const store = createRedisStore();
- runAdmit(store, { now: 1_000_000 });
-
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.has("opened_until_ms")).toBe(true);
- expect(cbHash.has("cb_window_from")).toBe(true);
- expect(cbHash.has("cb_failures")).toBe(true);
- expect(cbHash.has("cb_attempts")).toBe(true);
- expect(cbHash.has("cb_prev_failures")).toBe(true);
- expect(cbHash.has("cb_prev_attempts")).toBe(true);
+ runAdmit(store, {
+ now: 200_000,
+ });
+
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.has("bucket_tokens")).toBe(true);
+ expect(epHash.has("bucket_refilled_at")).toBe(true);
+ expect(epHash.has("cur_attempts")).toBe(false);
+ expect(epHash.has("cur_failures")).toBe(false);
+ expect(epHash.has("sample_till")).toBe(false);
+ expect(epHash.has("is_open")).toBe(false);
+ expect(epHash.has("switched_at")).toBe(false);
});
it("isolates state between targets", () => {
const store = createRedisStore();
- runAdmit(store, {}, "target-a");
- runAdmit(store, {}, "target-b");
+ store.set(
+ "ep:target-a",
+ new Map([
+ ["is_open", "0"],
+ ["bucket_tokens", "5"],
+ ["bucket_refilled_at", "10000"],
+ ]),
+ );
+ store.set(
+ "ep:target-b",
+ new Map([
+ ["is_open", "0"],
+ ["bucket_tokens", "3"],
+ ["bucket_refilled_at", "10000"],
+ ]),
+ );
+
+ runAdmit(store, { now: 10_000 }, "target-a");
+ runAdmit(store, { now: 10_000 }, "target-b");
- expect(store.has("cb:target-a")).toBe(true);
- expect(store.has("cb:target-b")).toBe(true);
- expect(store.has("rl:target-a")).toBe(true);
- expect(store.has("rl:target-b")).toBe(true);
+ expect(store.has("ep:target-a")).toBe(true);
+ expect(store.has("ep:target-b")).toBe(true);
});
});
});
diff --git a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts
index efbc6d88..eea9d44d 100644
--- a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts
+++ b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts
@@ -11,13 +11,13 @@ const mockDisconnect = jest.fn().mockResolvedValue(undefined);
const mockOn = jest.fn();
const defaultConfig: EndpointGateConfig = {
- burstCapacity: 10,
- cbProbeIntervalMs: 60_000,
- decayPeriodMs: 300_000,
- cbWindowPeriodMs: 60_000,
- cbErrorThreshold: 0.5,
- cbMinAttempts: 10,
- cbCooldownMs: 60_000,
+ burstCapacity: 2250,
+ probeRateLimit: 1 / 60,
+ recoveryPeriodMs: 600_000,
+ samplePeriodMs: 300_000,
+ failureThreshold: 0.3,
+ minAttempts: 5,
+ cooldownPeriodMs: 120_000,
};
const mockRedis = {
@@ -34,12 +34,23 @@ beforeEach(() => {
});
describe("admit", () => {
- it("returns allowed when tokens available", async () => {
- mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]);
+ it("returns allowed with consumedTokens when tokens available", async () => {
+ mockSendCommand.mockResolvedValueOnce([5, "allowed", 0, 10]);
- const result = await admit(mockRedis, "target-1", 10, true, defaultConfig);
+ const result = await admit(
+ mockRedis,
+ "target-1",
+ 10,
+ true,
+ 5,
+ defaultConfig,
+ );
- expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 });
+ expect(result).toEqual({
+ allowed: true,
+ consumedTokens: 5,
+ effectiveRate: 10,
+ });
expect(mockSendCommand).toHaveBeenCalledWith(
expect.arrayContaining(["EVALSHA"]),
);
@@ -48,7 +59,14 @@ describe("admit", () => {
it("returns rate_limited when tokens exhausted", async () => {
mockSendCommand.mockResolvedValueOnce([0, "rate_limited", 1000, 10]);
- const result = await admit(mockRedis, "target-1", 10, false, defaultConfig);
+ const result = await admit(
+ mockRedis,
+ "target-1",
+ 10,
+ false,
+ 5,
+ defaultConfig,
+ );
expect(result).toEqual({
allowed: false,
@@ -58,18 +76,17 @@ describe("admit", () => {
});
});
- it("returns allowed with probe flag when circuit is open but probe slot is available", async () => {
- mockSendCommand.mockResolvedValueOnce([1, "probe", 0, 0]);
-
- const result = await admit(mockRedis, "target-1", 10, true, defaultConfig);
-
- expect(result).toEqual({ allowed: true, probe: true, effectiveRate: 0 });
- });
-
- it("returns circuit_open without probe slot", async () => {
+ it("returns circuit_open when circuit is fully open", async () => {
mockSendCommand.mockResolvedValueOnce([0, "circuit_open", 30_000, 0]);
- const result = await admit(mockRedis, "target-1", 10, true, defaultConfig);
+ const result = await admit(
+ mockRedis,
+ "target-1",
+ 10,
+ true,
+ 5,
+ defaultConfig,
+ );
expect(result).toEqual({
allowed: false,
@@ -84,9 +101,20 @@ describe("admit", () => {
.mockRejectedValueOnce(new Error("NOSCRIPT No matching script"))
.mockResolvedValueOnce([1, "allowed", 0, 10]);
- const result = await admit(mockRedis, "target-1", 10, true, defaultConfig);
+ const result = await admit(
+ mockRedis,
+ "target-1",
+ 10,
+ true,
+ 1,
+ defaultConfig,
+ );
- expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 });
+ expect(result).toEqual({
+ allowed: true,
+ consumedTokens: 1,
+ effectiveRate: 10,
+ });
expect(mockSendCommand).toHaveBeenCalledTimes(2);
expect(mockSendCommand).toHaveBeenNthCalledWith(
1,
@@ -98,25 +126,33 @@ describe("admit", () => {
);
});
- it("passes cbProbeIntervalMs=0 when circuit breaker is disabled", async () => {
+ it("passes probeRateLimit=0 when circuit breaker is disabled", async () => {
mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]);
- await admit(mockRedis, "target-1", 10, false, defaultConfig);
+ await admit(mockRedis, "target-1", 10, false, 1, defaultConfig);
- // EVALSHA layout: [EVALSHA, sha, keyCount, cbKey, rlKey, now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs]
const args = mockSendCommand.mock.calls[0]![0] as string[];
- const cbProbeIntervalArg = args[11];
- expect(cbProbeIntervalArg).toBe("0");
+ const probeRateArg = args[9];
+ expect(probeRateArg).toBe("0");
});
- it("passes cbKey first, rlKey second", async () => {
+ it("passes single epKey", async () => {
mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 5]);
- await admit(mockRedis, "my-target", 5, true, defaultConfig);
+ await admit(mockRedis, "my-target", 5, true, 1, defaultConfig);
+
+ const args = mockSendCommand.mock.calls[0]![0] as string[];
+ expect(args[3]).toBe("ep:{my-target}");
+ });
+
+ it("passes targetBatchSize as ARGV", async () => {
+ mockSendCommand.mockResolvedValueOnce([3, "allowed", 0, 10]);
+
+ await admit(mockRedis, "target-1", 10, true, 7, defaultConfig);
const args = mockSendCommand.mock.calls[0]![0] as string[];
- expect(args[3]).toBe("cb:{my-target}");
- expect(args[4]).toBe("rl:{my-target}");
+ const batchSizeArg = args[10];
+ expect(batchSizeArg).toBe("7");
});
});
@@ -130,6 +166,7 @@ describe("evalScript", () => {
"target-1",
10,
true,
+ 1,
defaultConfig,
).catch((error: unknown) => error);
@@ -149,6 +186,7 @@ describe("evalScript", () => {
"target-1",
10,
true,
+ 1,
defaultConfig,
).catch((error: unknown) => error);
@@ -159,70 +197,101 @@ describe("evalScript", () => {
});
describe("recordResult", () => {
- it("returns closed on success below threshold", async () => {
- mockSendCommand.mockResolvedValueOnce([1, "closed"]);
+ it("returns closed state when circuit is steady-state", async () => {
+ mockSendCommand.mockResolvedValueOnce(["closed", 0]);
const result = await recordResult(
mockRedis,
"target-1",
- true,
+ 5,
+ 0,
defaultConfig,
);
- expect(result).toEqual({ ok: true, state: "closed" });
+ expect(result).toEqual({ circuitState: "closed", stateChanged: false });
expect(mockSendCommand).toHaveBeenCalledWith(
expect.arrayContaining(["EVALSHA"]),
);
});
- it("returns opened when failure crosses threshold", async () => {
- mockSendCommand.mockResolvedValueOnce([0, "opened"]);
+ it("returns open with stateChanged when failure crosses threshold", async () => {
+ mockSendCommand.mockResolvedValueOnce(["open", 1]);
const result = await recordResult(
mockRedis,
"target-1",
- false,
+ 5,
+ 5,
defaultConfig,
);
- expect(result).toEqual({ ok: false, state: "opened" });
+ expect(result).toEqual({ circuitState: "open", stateChanged: true });
});
- it("returns failed when failure is below threshold", async () => {
- mockSendCommand.mockResolvedValueOnce([0, "failed"]);
+ it("returns closed_recovery with stateChanged when circuit closes", async () => {
+ mockSendCommand.mockResolvedValueOnce(["closed_recovery", 1]);
const result = await recordResult(
mockRedis,
"target-1",
- false,
+ 5,
+ 0,
+ defaultConfig,
+ );
+
+ expect(result).toEqual({
+ circuitState: "closed_recovery",
+ stateChanged: true,
+ });
+ });
+
+ it("returns half_open without stateChanged when probing", async () => {
+ mockSendCommand.mockResolvedValueOnce(["half_open", 0]);
+
+ const result = await recordResult(
+ mockRedis,
+ "target-1",
+ 5,
+ 1,
defaultConfig,
);
- expect(result).toEqual({ ok: false, state: "failed" });
+ expect(result).toEqual({ circuitState: "half_open", stateChanged: false });
});
it("falls back to EVAL on NOSCRIPT error", async () => {
mockSendCommand
.mockRejectedValueOnce(new Error("NOSCRIPT No matching script"))
- .mockResolvedValueOnce([1, "closed"]);
+ .mockResolvedValueOnce(["closed", 0]);
const result = await recordResult(
mockRedis,
"target-1",
- true,
+ 1,
+ 0,
defaultConfig,
);
- expect(result).toEqual({ ok: true, state: "closed" });
+ expect(result).toEqual({ circuitState: "closed", stateChanged: false });
expect(mockSendCommand).toHaveBeenCalledTimes(2);
});
- it("passes correct cb key for target", async () => {
- mockSendCommand.mockResolvedValueOnce([1, "closed"]);
+ it("passes correct ep key for target", async () => {
+ mockSendCommand.mockResolvedValueOnce(["closed", 0]);
+
+ await recordResult(mockRedis, "my-target", 1, 0, defaultConfig);
+
+ const args = mockSendCommand.mock.calls[0]![0] as string[];
+ expect(args[3]).toBe("ep:{my-target}");
+ });
+
+ it("passes consumedTokens and processingFailures as ARGV", async () => {
+ mockSendCommand.mockResolvedValueOnce(["closed", 0]);
- await recordResult(mockRedis, "my-target", true, defaultConfig);
+ await recordResult(mockRedis, "target-1", 8, 3, defaultConfig);
const args = mockSendCommand.mock.calls[0]![0] as string[];
- expect(args[3]).toBe("cb:{my-target}");
+ expect(args[5]).toBe("8");
+ expect(args[6]).toBe("3");
});
});
diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts
index 3b8ad521..ab5dff46 100644
--- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts
+++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts
@@ -3,7 +3,6 @@ import {
DEFAULT_TARGET,
makeRecord,
} from "__tests__/fixtures/handler-fixtures";
-import { VisibilityManagedError } from "services/visibility-managed-error";
jest.mock("@nhs-notify-client-callbacks/logger", () => ({
logger: {
@@ -74,17 +73,20 @@ jest.mock("services/redis-client", () => ({
getRedisClient: (...args: unknown[]) => mockGetRedisClient(...args),
}));
+jest.mock("services/delivery-observability", () => ({
+ recordAdmissionDenied: jest.fn(),
+ recordCircuitBreakerClosed: jest.fn(),
+ recordCircuitBreakerOpen: jest.fn(),
+ recordDeliveryAttempt: jest.fn(),
+ recordDeliveryDuration: jest.fn(),
+ recordDeliveryFailure: jest.fn(),
+ recordDeliveryPermanentFailure: jest.fn(),
+ recordDeliveryRateLimited: jest.fn(),
+ recordDeliverySuccess: jest.fn(),
+ recordRetryWindowExhausted: jest.fn(),
+}));
+
jest.mock("services/delivery-metrics", () => ({
- emitAdmissionDenied: jest.fn(),
- emitCircuitBreakerClosed: jest.fn(),
- emitCircuitBreakerOpen: jest.fn(),
- emitDeliveryAttempt: jest.fn(),
- emitDeliveryDuration: jest.fn(),
- emitDeliveryFailure: jest.fn(),
- emitDeliveryPermanentFailure: jest.fn(),
- emitDeliverySuccess: jest.fn(),
- emitRateLimited: jest.fn(),
- emitRetryWindowExhausted: jest.fn(),
flushMetrics: jest.fn().mockResolvedValue(undefined),
resetMetrics: jest.fn(),
}));
@@ -106,15 +108,15 @@ describe("processRecords", () => {
mockJitteredBackoff.mockReturnValue(5);
mockIsWindowExhausted.mockReturnValue(false);
mockHandleRateLimitedRecord.mockRejectedValue(
- new VisibilityManagedError("Rate limited — requeue"),
+ new Error("Rate limited — requeue"),
);
mockGetRedisClient.mockResolvedValue({});
mockAdmit.mockResolvedValue({
allowed: true,
- probe: false,
+ consumedTokens: 100,
effectiveRate: 10,
});
- mockRecordResult.mockResolvedValue({ ok: true, state: "closed" });
+  mockRecordResult.mockResolvedValue({ circuitState: "closed", stateChanged: false });
});
it("returns no failures on successful delivery", async () => {
@@ -159,7 +161,7 @@ describe("processRecords", () => {
expect(failures).toEqual([{ itemIdentifier: "msg-1" }]);
});
- it("returns failure for 429 rate-limited responses", async () => {
+ it("returns failure for 429 when handleRateLimitedRecord rejects", async () => {
mockDeliverPayload.mockResolvedValue({
outcome: "rate_limited",
retryAfterHeader: "60",
@@ -177,7 +179,7 @@ describe("processRecords", () => {
);
});
- it("processes multiple records independently", async () => {
+ it("processes multiple records in a single target batch", async () => {
const record1 = makeRecord({ messageId: "msg-1" });
const record2 = makeRecord({ messageId: "msg-2" });
@@ -191,25 +193,45 @@ describe("processRecords", () => {
const failures = await processRecords([record1, record2]);
expect(failures).toEqual([{ itemIdentifier: "msg-2" }]);
+ expect(mockAdmit).toHaveBeenCalledTimes(1);
});
- it("an unexpected error on one record does not prevent subsequent records being processed", async () => {
+ it("delivers only admitted records when consumedTokens is less than batch size", async () => {
const record1 = makeRecord({ messageId: "msg-1" });
const record2 = makeRecord({ messageId: "msg-2" });
+ const record3 = makeRecord({ messageId: "msg-3" });
+
+ mockAdmit.mockResolvedValue({
+ allowed: true,
+ consumedTokens: 1,
+ effectiveRate: 10,
+ });
+
+ const failures = await processRecords([record1, record2, record3]);
- mockLoadTargetConfig
- .mockRejectedValueOnce(new Error("S3 unavailable"))
- .mockResolvedValueOnce(DEFAULT_TARGET);
+ expect(mockDeliverPayload).toHaveBeenCalledTimes(1);
+ expect(failures).toEqual([
+ { itemIdentifier: "msg-2" },
+ { itemIdentifier: "msg-3" },
+ ]);
+ });
+
+ it("an unexpected delivery error does not prevent other records in the batch", async () => {
+ const record1 = makeRecord({ messageId: "msg-1" });
+ const record2 = makeRecord({ messageId: "msg-2" });
+
+ mockDeliverPayload
+ .mockRejectedValueOnce(new Error("Connection reset"))
+ .mockResolvedValueOnce({ outcome: "success" });
const failures = await processRecords([record1, record2]);
expect(failures).toEqual([{ itemIdentifier: "msg-1" }]);
- expect(mockDeliverPayload).toHaveBeenCalledTimes(1);
expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5);
});
it("applies jittered backoff cooldown on unexpected errors", async () => {
- mockLoadTargetConfig.mockRejectedValue(new Error("Infrastructure error"));
+ mockDeliverPayload.mockRejectedValue(new Error("Infrastructure error"));
const failures = await processRecords([makeRecord()]);
@@ -217,7 +239,7 @@ describe("processRecords", () => {
expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5);
});
- it("does not apply a second visibility change for admission-denied (managed path)", async () => {
+ it("changes visibility once per record for admission-denied batch", async () => {
mockAdmit.mockResolvedValue({
allowed: false,
reason: "rate_limited",
@@ -230,7 +252,7 @@ describe("processRecords", () => {
expect(mockChangeVisibility).toHaveBeenCalledTimes(1);
});
- it("does not apply a second visibility change for transient failure (managed path)", async () => {
+ it("changes visibility once for transient failure", async () => {
mockDeliverPayload.mockResolvedValue({
outcome: "transient_failure",
statusCode: 503,
@@ -241,13 +263,13 @@ describe("processRecords", () => {
expect(mockChangeVisibility).toHaveBeenCalledTimes(1);
});
- it("returns failure when CLIENT_ID is not set", async () => {
+ it("throws when CLIENT_ID is not set", async () => {
const saved = process.env.CLIENT_ID;
delete process.env.CLIENT_ID;
- const failures = await processRecords([makeRecord()]);
-
- expect(failures).toEqual([{ itemIdentifier: "msg-1" }]);
+ await expect(processRecords([makeRecord()])).rejects.toThrow(
+ "CLIENT_ID is required",
+ );
process.env.CLIENT_ID = saved;
});
@@ -262,7 +284,7 @@ describe("processRecords", () => {
expect(mockDeliverPayload).not.toHaveBeenCalled();
});
- it("calls changeVisibility with backoff on 5xx then throws", async () => {
+ it("calls changeVisibility with backoff on 5xx", async () => {
mockDeliverPayload.mockResolvedValue({
outcome: "transient_failure",
statusCode: 503,
@@ -303,7 +325,7 @@ describe("processRecords", () => {
expect(failures).toEqual([]);
});
- it("requeues when rate limited by endpoint gate", async () => {
+ it("requeues all records when rate limited by endpoint gate", async () => {
mockAdmit.mockResolvedValue({
allowed: false,
reason: "rate_limited",
@@ -314,12 +336,14 @@ describe("processRecords", () => {
const failures = await processRecords([makeRecord()]);
expect(failures).toEqual([{ itemIdentifier: "msg-1" }]);
- expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 2);
+ const visibilityDelay = mockChangeVisibility.mock.calls[0]![1] as number;
+ expect(visibilityDelay).toBeGreaterThanOrEqual(2);
+ expect(visibilityDelay).toBeLessThanOrEqual(6);
expect(mockSendToDlq).not.toHaveBeenCalled();
expect(mockDeliverPayload).not.toHaveBeenCalled();
});
- it("requeues when circuit is open", async () => {
+ it("requeues all records when circuit is open", async () => {
mockAdmit.mockResolvedValue({
allowed: false,
reason: "circuit_open",
@@ -330,7 +354,9 @@ describe("processRecords", () => {
const failures = await processRecords([makeRecord()]);
expect(failures).toEqual([{ itemIdentifier: "msg-1" }]);
- expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 30);
+ const visibilityDelay = mockChangeVisibility.mock.calls[0]![1] as number;
+ expect(visibilityDelay).toBeGreaterThanOrEqual(30);
+ expect(visibilityDelay).toBeLessThanOrEqual(34);
expect(mockSendToDlq).not.toHaveBeenCalled();
expect(mockDeliverPayload).not.toHaveBeenCalled();
});
@@ -350,17 +376,23 @@ describe("processRecords", () => {
"target-1",
10,
false,
+ 1,
expect.any(Object),
);
expect(mockDeliverPayload).toHaveBeenCalled();
});
- it("calls recordResult(true) on successful delivery when CB enabled", async () => {
+ it("calls recordResult with batch counts on successful delivery when CB enabled", async () => {
const targetCb = {
...DEFAULT_TARGET,
delivery: { circuitBreaker: { enabled: true } },
};
mockLoadTargetConfig.mockResolvedValue(targetCb);
+ mockAdmit.mockResolvedValue({
+ allowed: true,
+ consumedTokens: 1,
+ effectiveRate: 10,
+ });
const failures = await processRecords([makeRecord()]);
@@ -368,17 +400,23 @@ describe("processRecords", () => {
expect(mockRecordResult).toHaveBeenCalledWith(
expect.anything(),
"target-1",
- true,
+ 1,
+ 0,
expect.any(Object),
);
});
- it("calls recordResult(false) on 5xx before visibility change", async () => {
+ it("calls recordResult with failure count on 5xx when CB enabled", async () => {
const targetCb = {
...DEFAULT_TARGET,
delivery: { circuitBreaker: { enabled: true } },
};
mockLoadTargetConfig.mockResolvedValue(targetCb);
+ mockAdmit.mockResolvedValue({
+ allowed: true,
+ consumedTokens: 1,
+ effectiveRate: 10,
+ });
mockDeliverPayload.mockResolvedValue({
outcome: "transient_failure",
statusCode: 503,
@@ -390,13 +428,14 @@ describe("processRecords", () => {
expect(mockRecordResult).toHaveBeenCalledWith(
expect.anything(),
"target-1",
- false,
+ 1,
+ 1,
expect.any(Object),
);
expect(mockChangeVisibility).toHaveBeenCalled();
});
- it("does not call recordResult on rate-limited path", async () => {
+ it("does not call recordResult on gate admission-denied path", async () => {
mockAdmit.mockResolvedValue({
allowed: false,
reason: "rate_limited",
@@ -409,17 +448,6 @@ describe("processRecords", () => {
expect(mockRecordResult).not.toHaveBeenCalled();
});
- it("does not call recordResult on 429 path", async () => {
- mockDeliverPayload.mockResolvedValue({
- outcome: "rate_limited",
- retryAfterHeader: "60",
- });
-
- await processRecords([makeRecord()]);
-
- expect(mockRecordResult).not.toHaveBeenCalled();
- });
-
it("does not call recordResult when CB is disabled on transient failure", async () => {
const targetNoCb = {
...DEFAULT_TARGET,
@@ -449,7 +477,7 @@ describe("processRecords", () => {
expect(mockRecordResult).not.toHaveBeenCalled();
});
- it("emits CircuitBreakerOpen metric when recordResult returns opened", async () => {
+ it("records CircuitBreakerOpen when recordResult indicates circuit opened", async () => {
const targetCb = {
...DEFAULT_TARGET,
delivery: { circuitBreaker: { enabled: true } },
@@ -459,18 +487,21 @@ describe("processRecords", () => {
outcome: "transient_failure",
statusCode: 503,
});
- mockRecordResult.mockResolvedValue({ ok: false, state: "opened" });
+ mockRecordResult.mockResolvedValue({
+ circuitState: "open",
+ stateChanged: true,
+ });
- const { emitCircuitBreakerOpen } = jest.requireMock(
- "services/delivery-metrics",
+ const { recordCircuitBreakerOpen } = jest.requireMock(
+ "services/delivery-observability",
);
await processRecords([makeRecord()]);
- expect(emitCircuitBreakerOpen).toHaveBeenCalledWith("target-1");
+ expect(recordCircuitBreakerOpen).toHaveBeenCalledWith("target-1");
});
- it("does not emit CircuitBreakerOpen when recordResult returns failed", async () => {
+ it("does not record CircuitBreakerOpen when recordResult has no state change", async () => {
const targetCb = {
...DEFAULT_TARGET,
delivery: { circuitBreaker: { enabled: true } },
@@ -480,18 +511,21 @@ describe("processRecords", () => {
outcome: "transient_failure",
statusCode: 503,
});
- mockRecordResult.mockResolvedValue({ ok: false, state: "failed" });
+ mockRecordResult.mockResolvedValue({
+ circuitState: "open",
+ stateChanged: false,
+ });
- const { emitCircuitBreakerOpen } = jest.requireMock(
- "services/delivery-metrics",
+ const { recordCircuitBreakerOpen } = jest.requireMock(
+ "services/delivery-observability",
);
await processRecords([makeRecord()]);
- expect(emitCircuitBreakerOpen).not.toHaveBeenCalled();
+ expect(recordCircuitBreakerOpen).not.toHaveBeenCalled();
});
- it("does not emit CircuitBreakerOpen when recordResult returns closed", async () => {
+ it("does not record CircuitBreakerOpen when circuit is closed", async () => {
const targetCb = {
...DEFAULT_TARGET,
delivery: { circuitBreaker: { enabled: true } },
@@ -501,28 +535,61 @@ describe("processRecords", () => {
outcome: "transient_failure",
statusCode: 503,
});
- mockRecordResult.mockResolvedValue({ ok: true, state: "closed" });
+ mockRecordResult.mockResolvedValue({
+ circuitState: "closed",
+ stateChanged: false,
+ });
- const { emitCircuitBreakerOpen } = jest.requireMock(
- "services/delivery-metrics",
+ const { recordCircuitBreakerOpen } = jest.requireMock(
+ "services/delivery-observability",
);
await processRecords([makeRecord()]);
- expect(emitCircuitBreakerOpen).not.toHaveBeenCalled();
+ expect(recordCircuitBreakerOpen).not.toHaveBeenCalled();
});
- it("emits RateLimited metric on 429 response", async () => {
+ it("records CircuitBreakerClosed when recordResult indicates circuit closed", async () => {
+ const targetCb = {
+ ...DEFAULT_TARGET,
+ delivery: { circuitBreaker: { enabled: true } },
+ };
+ mockLoadTargetConfig.mockResolvedValue(targetCb);
+ mockDeliverPayload.mockResolvedValue({
+ outcome: "success",
+ statusCode: 200,
+ });
+ mockRecordResult.mockResolvedValue({
+ circuitState: "closed_recovery",
+ stateChanged: true,
+ });
+
+ const { recordCircuitBreakerClosed } = jest.requireMock(
+ "services/delivery-observability",
+ );
+
+ await processRecords([makeRecord()]);
+
+ expect(recordCircuitBreakerClosed).toHaveBeenCalledWith("target-1");
+ });
+
+ it("records RateLimited on 429 response", async () => {
mockDeliverPayload.mockResolvedValue({
outcome: "rate_limited",
retryAfterHeader: "60",
});
- const { emitRateLimited } = jest.requireMock("services/delivery-metrics");
+ const { recordDeliveryRateLimited } = jest.requireMock(
+ "services/delivery-observability",
+ );
await processRecords([makeRecord()]);
- expect(emitRateLimited).toHaveBeenCalledWith("target-1");
+ expect(recordDeliveryRateLimited).toHaveBeenCalledWith(
+ "client-1",
+ "target-1",
+ "test-message-id",
+ );
});
it("uses configured maxRetryDurationSeconds when set on target", async () => {
@@ -558,4 +625,30 @@ describe("processRecords", () => {
7_200_000,
);
});
+
+ it("groups records by target and processes each batch separately", async () => {
+ const record1 = makeRecord({ messageId: "msg-1" });
+ const record2 = makeRecord({
+ messageId: "msg-2",
+ body: JSON.stringify({
+ payload: {
+ data: [
+ {
+ type: "MessageStatus",
+ attributes: { messageStatus: "delivered" },
+ },
+ ],
+ },
+ subscriptionId: "sub-2",
+ targetId: "target-2",
+ }),
+ });
+
+ const failures = await processRecords([record1, record2]);
+
+ expect(failures).toEqual([]);
+ expect(mockAdmit).toHaveBeenCalledTimes(2);
+ expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-1");
+ expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-2");
+ });
});
diff --git a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts
index 515f1377..00e04707 100644
--- a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts
+++ b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts
@@ -1,31 +1,35 @@
import recordResultLuaSrc from "services/record-result.lua";
import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock";
-// ARGV: [now, success, cooldownMs, decayPeriodMs, cbErrorThreshold, cbMinAttempts, cbWindowPeriodMs]
-// KEYS: [cbKey]
-// Returns: [ok (0|1), state] state: "closed" | "opened" | "failed"
+// ARGV: [now, consumedTokens, processingFailures, cooldownPeriodMs, recoveryPeriodMs, failureThreshold, minAttempts, samplePeriodMs]
+// KEYS: [epKey]
+// Returns: [circuitState, stateChanged]
+// circuitState: "open" | "half_open" | "closed_recovery" | "closed"
+// stateChanged: 0 | 1
type RecordResultArgs = {
now: number;
- success: boolean;
- cooldownMs: number;
- decayPeriodMs: number;
- cbErrorThreshold: number;
- cbMinAttempts: number;
- cbWindowPeriodMs: number;
+ consumedTokens: number;
+ processingFailures: number;
+ cooldownPeriodMs: number;
+ recoveryPeriodMs: number;
+ failureThreshold: number;
+ minAttempts: number;
+ samplePeriodMs: number;
};
const defaultArgs: RecordResultArgs = {
now: 1_000_000,
- success: true,
- cooldownMs: 60_000,
- decayPeriodMs: 300_000,
- cbErrorThreshold: 0.5,
- cbMinAttempts: 10,
- cbWindowPeriodMs: 60_000,
+ consumedTokens: 1,
+ processingFailures: 0,
+ cooldownPeriodMs: 120_000,
+ recoveryPeriodMs: 600_000,
+ failureThreshold: 0.3,
+ minAttempts: 5,
+ samplePeriodMs: 300_000,
};
-type RecordResultResult = [number, string];
+type RecordResultResult = [string, number];
function runRecordResult(
store: ReturnType,
@@ -35,15 +39,16 @@ function runRecordResult(
const merged = { ...defaultArgs, ...args };
return evalLua(
recordResultLuaSrc,
- [`cb:${targetId}`],
+ [`ep:${targetId}`],
[
merged.now.toString(),
- merged.success ? "1" : "0",
- merged.cooldownMs.toString(),
- merged.decayPeriodMs.toString(),
- merged.cbErrorThreshold.toString(),
- merged.cbMinAttempts.toString(),
- merged.cbWindowPeriodMs.toString(),
+ merged.consumedTokens.toString(),
+ merged.processingFailures.toString(),
+ merged.cooldownPeriodMs.toString(),
+ merged.recoveryPeriodMs.toString(),
+ merged.failureThreshold.toString(),
+ merged.minAttempts.toString(),
+ merged.samplePeriodMs.toString(),
],
store,
) as RecordResultResult;
@@ -51,323 +56,405 @@ function runRecordResult(
describe("record-result.lua", () => {
describe("success recording", () => {
- it("returns closed state for a successful result", () => {
+ it("returns closed state for a successful batch with no state change", () => {
const store = createRedisStore();
- const [ok, state] = runRecordResult(store, { success: true });
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
+ ]),
+ );
+
+ const [circuitState, stateChanged] = runRecordResult(store, {
+ consumedTokens: 5,
+ processingFailures: 0,
+ });
- expect(ok).toBe(1);
- expect(state).toBe("closed");
+ expect(circuitState).toBe("closed");
+ expect(stateChanged).toBe(0);
});
- it("increments attempt count without incrementing failures", () => {
+ it("increments cur_attempts without incrementing cur_failures", () => {
const store = createRedisStore();
- runRecordResult(store, { success: true });
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
+ ]),
+ );
+
+ runRecordResult(store, { consumedTokens: 3, processingFailures: 0 });
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("cb_attempts")).toBe("1");
- expect(cbHash.get("cb_failures")).toBe("0");
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("cur_attempts")).toBe("3");
+ expect(epHash.get("cur_failures")).toBe("0");
});
});
describe("failure recording", () => {
- it("increments both attempts and failures on error", () => {
+ it("increments both cur_attempts and cur_failures", () => {
const store = createRedisStore();
- runRecordResult(store, { success: false });
-
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("cb_attempts")).toBe("1");
- expect(cbHash.get("cb_failures")).toBe("1");
- });
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
+ ]),
+ );
- it("returns failed state for a single failure below threshold", () => {
- const store = createRedisStore();
- const [ok, state] = runRecordResult(store, { success: false });
+ runRecordResult(store, { consumedTokens: 5, processingFailures: 1 });
- expect(ok).toBe(0);
- expect(state).toBe("failed");
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("cur_attempts")).toBe("5");
+ expect(epHash.get("cur_failures")).toBe("1");
});
- it("stays closed when below error threshold", () => {
+ it("returns closed state for failures below threshold", () => {
const store = createRedisStore();
- const now = 1_000_000;
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
+ ]),
+ );
- for (let i = 0; i < 8; i++) {
- runRecordResult(store, { now, success: true });
- }
- for (let i = 0; i < 2; i++) {
- runRecordResult(store, { now, success: false });
- }
+ const [circuitState, stateChanged] = runRecordResult(store, {
+ consumedTokens: 1,
+ processingFailures: 1,
+ });
- const [ok, state] = runRecordResult(store, { now, success: true });
- expect(ok).toBe(1);
- expect(state).toBe("closed");
+ expect(circuitState).toBe("closed");
+ expect(stateChanged).toBe(0);
});
});
- describe("circuit opening", () => {
- it("opens circuit when error rate exceeds threshold", () => {
+ describe("recording guard — fully open", () => {
+ it("does not record attempts/failures when circuit is fully open", () => {
const store = createRedisStore();
const now = 1_000_000;
+ const switchedAt = now - 10_000;
- for (let i = 0; i < 4; i++) {
- const [, state] = runRecordResult(store, {
- now,
- success: false,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
- });
- expect(state).toBe("failed");
- }
-
- const [ok, state] = runRecordResult(store, {
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["sample_till", "9999999999"],
+ ["cur_attempts", "0"],
+ ["cur_failures", "0"],
+ ]),
+ );
+
+ runRecordResult(store, {
now,
- success: false,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
+ cooldownPeriodMs: 120_000,
+ consumedTokens: 5,
+ processingFailures: 3,
});
- expect(ok).toBe(0);
- expect(state).toBe("opened");
+
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("cur_attempts")).toBe("0");
+ expect(epHash.get("cur_failures")).toBe("0");
});
- it("does not open circuit when below minimum attempts", () => {
+ it("returns open when circuit is fully open and state unchanged", () => {
const store = createRedisStore();
const now = 1_000_000;
+ const switchedAt = now - 10_000;
- for (let i = 0; i < 4; i++) {
- runRecordResult(store, {
- now,
- success: false,
- cbMinAttempts: 10,
- });
- }
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["sample_till", "9999999999"],
+ ]),
+ );
- const [ok, state] = runRecordResult(store, {
+ const [circuitState, stateChanged] = runRecordResult(store, {
now,
- success: false,
- cbMinAttempts: 10,
+ cooldownPeriodMs: 120_000,
+ consumedTokens: 1,
+ processingFailures: 0,
});
- expect(ok).toBe(0);
- expect(state).toBe("failed");
+
+ expect(circuitState).toBe("open");
+ expect(stateChanged).toBe(0);
});
+ });
- it("sets opened_until_ms with cooldown on open", () => {
+ describe("circuit opening", () => {
+ it("opens circuit when failure rate exceeds threshold", () => {
const store = createRedisStore();
- const now = 1_000_000;
- const cooldownMs = 30_000;
-
- for (let i = 0; i < 5; i++) {
- runRecordResult(store, {
- now,
- success: false,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
- cooldownMs,
- });
- }
-
- const cbHash = store.get("cb:t1")!;
- expect(Number(cbHash.get("opened_until_ms"))).toBe(now + cooldownMs);
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
+ ]),
+ );
+
+ const [circuitState, stateChanged] = runRecordResult(store, {
+ consumedTokens: 5,
+ processingFailures: 5,
+ minAttempts: 5,
+ failureThreshold: 0.3,
+ });
+ expect(circuitState).toBe("open");
+ expect(stateChanged).toBe(1);
});
- it("resets all counters on open", () => {
+ it("does not open circuit when below minimum attempts", () => {
const store = createRedisStore();
- const now = 1_000_000;
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
+ ]),
+ );
- for (let i = 0; i < 5; i++) {
- runRecordResult(store, {
- now,
- success: false,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
- });
- }
-
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("cb_failures")).toBe("0");
- expect(cbHash.get("cb_attempts")).toBe("0");
- expect(cbHash.get("cb_window_from")).toBe("0");
- expect(cbHash.get("cb_prev_failures")).toBe("0");
- expect(cbHash.get("cb_prev_attempts")).toBe("0");
+ const [circuitState, stateChanged] = runRecordResult(store, {
+ consumedTokens: 3,
+ processingFailures: 3,
+ minAttempts: 5,
+ failureThreshold: 0.3,
+ });
+ expect(circuitState).toBe("closed");
+ expect(stateChanged).toBe(0);
});
- it("does not double-trip when circuit is already open", () => {
+ it("sets is_open and switched_at on open", () => {
const store = createRedisStore();
const now = 1_000_000;
- const openedUntil = now + 60_000;
-
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["opened_until_ms", openedUntil.toString()],
- ["cb_window_from", now.toString()],
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
]),
);
- for (let i = 0; i < 20; i++) {
- const [, state] = runRecordResult(store, {
- now,
- success: false,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
- });
- expect(state).toBe("failed");
- }
-
- const cbHash = store.get("cb:t1")!;
- expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil);
+ runRecordResult(store, {
+ now,
+ consumedTokens: 5,
+ processingFailures: 5,
+ minAttempts: 5,
+ failureThreshold: 0.3,
+ });
+
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("is_open")).toBe("1");
+ expect(Number(epHash.get("switched_at"))).toBe(now);
});
- });
- describe("two-window blended rate", () => {
- it("blends previous window failures into current assessment", () => {
+ it("resets all counters and sets sampleTill on open", () => {
const store = createRedisStore();
const now = 1_000_000;
- const cbWindowPeriodMs = 60_000;
-
+ const samplePeriodMs = 300_000;
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["cb_window_from", now.toString()],
- ["cb_prev_failures", "8"],
- ["cb_prev_attempts", "10"],
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
]),
);
- const [ok, state] = runRecordResult(store, {
+ runRecordResult(store, {
now,
- success: false,
- cbWindowPeriodMs,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
+ consumedTokens: 5,
+ processingFailures: 5,
+ minAttempts: 5,
+ failureThreshold: 0.3,
+ samplePeriodMs,
});
- expect(ok).toBe(0);
- expect(state).toBe("opened");
+
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("cur_failures")).toBe("0");
+ expect(epHash.get("cur_attempts")).toBe("0");
+ expect(epHash.get("prev_failures")).toBe("0");
+ expect(epHash.get("prev_attempts")).toBe("0");
+ expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs);
});
+ });
- it("reduces previous window weight as current window ages", () => {
+ describe("circuit closing — half-open with successes", () => {
+ it("closes circuit when half-open and batch has successes", () => {
const store = createRedisStore();
- const cbWindowPeriodMs = 100_000;
- const t0 = 1_000_000;
- const nearEnd = t0 + cbWindowPeriodMs - 1;
+ const now = 1_000_000;
+ const switchedAt = now - 130_000;
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["cb_window_from", t0.toString()],
- ["cb_prev_failures", "10"],
- ["cb_prev_attempts", "10"],
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["sample_till", "9999999999"],
]),
);
- for (let i = 0; i < 20; i++) {
- runRecordResult(store, {
- now: nearEnd,
- success: true,
- cbWindowPeriodMs,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
- });
- }
-
- const [, state] = runRecordResult(store, {
- now: nearEnd,
- success: false,
- cbWindowPeriodMs,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
+ const [circuitState, stateChanged] = runRecordResult(store, {
+ now,
+ cooldownPeriodMs: 120_000,
+ consumedTokens: 1,
+ processingFailures: 0,
});
- expect(state).toBe("failed");
+
+ expect(circuitState).toBe("closed_recovery");
+ expect(stateChanged).toBe(1);
+
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("is_open")).toBe("0");
+ expect(Number(epHash.get("switched_at"))).toBe(now);
});
- it("ignores previous window when cbWindowPeriodMs is 0", () => {
+ it("does not close when half-open but all attempts failed", () => {
const store = createRedisStore();
const now = 1_000_000;
+ const switchedAt = now - 130_000;
store.set(
- "cb:t1",
+ "ep:t1",
new Map([
- ["cb_window_from", now.toString()],
- ["cb_prev_failures", "100"],
- ["cb_prev_attempts", "100"],
+ ["is_open", "1"],
+ ["switched_at", switchedAt.toString()],
+ ["sample_till", "9999999999"],
]),
);
- const [, state] = runRecordResult(store, {
+ const [circuitState, stateChanged] = runRecordResult(store, {
now,
- success: false,
- cbWindowPeriodMs: 0,
- cbMinAttempts: 5,
- cbErrorThreshold: 0.5,
+ cooldownPeriodMs: 120_000,
+ consumedTokens: 1,
+ processingFailures: 1,
});
- expect(state).toBe("failed");
+
+ expect(circuitState).toBe("half_open");
+ expect(stateChanged).toBe(0);
});
});
- describe("decay period", () => {
- it("preserves opened_until_ms during active decay", () => {
+ describe("sliding window management", () => {
+ it("promotes current to previous when sampleTill expires", () => {
const store = createRedisStore();
- const openedUntil = 1_060_000;
- const duringDecay = openedUntil + 100_000;
+ const now = 1_000_000;
+ const samplePeriodMs = 300_000;
+ const sampleTill = now - 1;
store.set(
- "cb:t1",
- new Map([["opened_until_ms", openedUntil.toString()]]),
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", sampleTill.toString()],
+ ["cur_attempts", "10"],
+ ["cur_failures", "3"],
+ ["prev_attempts", "0"],
+ ["prev_failures", "0"],
+ ]),
);
- runRecordResult(store, {
- now: duringDecay,
- success: true,
- decayPeriodMs: 300_000,
- });
+ runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 });
- const cbHash = store.get("cb:t1")!;
- expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil);
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("prev_attempts")).toBe("10");
+ expect(epHash.get("prev_failures")).toBe("3");
+ expect(Number(epHash.get("sample_till"))).toBe(
+ sampleTill + samplePeriodMs,
+ );
});
- it("clears opened_until_ms after decay period elapses", () => {
+ it("complete reset when window is too old", () => {
const store = createRedisStore();
- const openedUntil = 1_060_000;
- const decayPeriodMs = 300_000;
- const afterDecay = openedUntil + decayPeriodMs + 1;
+ const now = 1_000_000;
+ const samplePeriodMs = 300_000;
+ const sampleTill = now - samplePeriodMs - 1;
store.set(
- "cb:t1",
- new Map([["opened_until_ms", openedUntil.toString()]]),
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", sampleTill.toString()],
+ ["cur_attempts", "10"],
+ ["cur_failures", "3"],
+ ["prev_attempts", "5"],
+ ["prev_failures", "2"],
+ ]),
);
- runRecordResult(store, {
- now: afterDecay,
- success: true,
- decayPeriodMs,
- });
+ runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 });
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("opened_until_ms")).toBe("0");
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.get("prev_attempts")).toBe("0");
+ expect(epHash.get("prev_failures")).toBe("0");
+ expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs);
});
- it("clears opened_until_ms when circuit was never opened", () => {
+ it("interpolates using weight from sampleTill", () => {
const store = createRedisStore();
+ const samplePeriodMs = 300_000;
const now = 1_000_000;
+ const sampleTill = now + samplePeriodMs;
- runRecordResult(store, { now, success: true });
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", sampleTill.toString()],
+ ["prev_attempts", "10"],
+ ["prev_failures", "10"],
+ ]),
+ );
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.get("opened_until_ms")).toBe("0");
+ // weight = (sampleTill - now) / samplePeriodMs = 1.0
+ // interpolated attempts = 10 * 1.0 + 5 = 15 (>= minAttempts 5)
+ // interpolated failures = 10 * 1.0 + 5 = 15
+ // failure rate = 15/15 = 1.0 > 0.3 → opens
+ const [circuitState, stateChanged] = runRecordResult(store, {
+ now,
+ samplePeriodMs,
+ consumedTokens: 5,
+ processingFailures: 5,
+ minAttempts: 5,
+ failureThreshold: 0.3,
+ });
+ expect(circuitState).toBe("open");
+ expect(stateChanged).toBe(1);
});
});
describe("state persistence", () => {
- it("writes all counter fields to redis", () => {
+ it("writes all sampling fields to redis", () => {
const store = createRedisStore();
+ store.set(
+ "ep:t1",
+ new Map([
+ ["is_open", "0"],
+ ["switched_at", "0"],
+ ["sample_till", "9999999999"],
+ ]),
+ );
runRecordResult(store);
- const cbHash = store.get("cb:t1")!;
- expect(cbHash.has("opened_until_ms")).toBe(true);
- expect(cbHash.has("cb_window_from")).toBe(true);
- expect(cbHash.has("cb_failures")).toBe(true);
- expect(cbHash.has("cb_attempts")).toBe(true);
- expect(cbHash.has("cb_prev_failures")).toBe(true);
- expect(cbHash.has("cb_prev_attempts")).toBe(true);
+ const epHash = store.get("ep:t1")!;
+ expect(epHash.has("cur_attempts")).toBe(true);
+ expect(epHash.has("cur_failures")).toBe(true);
+ expect(epHash.has("prev_attempts")).toBe(true);
+ expect(epHash.has("prev_failures")).toBe(true);
+ expect(epHash.has("sample_till")).toBe(true);
});
});
});
diff --git a/lambdas/https-client-lambda/src/handler.ts b/lambdas/https-client-lambda/src/handler.ts
index 28fcc6b9..b129fc8d 100644
--- a/lambdas/https-client-lambda/src/handler.ts
+++ b/lambdas/https-client-lambda/src/handler.ts
@@ -12,7 +12,6 @@ import {
OUTCOME_SUCCESS,
deliverPayload,
} from "services/delivery/https-client";
-import type { DeliveryResult } from "services/delivery/https-client";
import { sendToDlq } from "services/dlq-sender";
import { changeVisibility } from "services/sqs-visibility";
import {
@@ -26,7 +25,6 @@ import {
recordResult,
} from "services/endpoint-gate";
import { getRedisClient } from "services/redis-client";
-import { VisibilityManagedError } from "services/visibility-managed-error";
import {
recordAdmissionDenied,
recordCircuitBreakerClosed,
@@ -47,13 +45,20 @@ const DEFAULT_MAX_RETRY_DURATION_MS = 7_200_000; // 2 hours
const DEFAULT_CONCURRENCY_LIMIT = 5;
const gateConfig: EndpointGateConfig = {
- burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? "10"),
- cbProbeIntervalMs: Number(process.env.CB_PROBE_INTERVAL_MS ?? "60000"),
- decayPeriodMs: Number(process.env.CB_DECAY_PERIOD_MS ?? "300000"),
- cbWindowPeriodMs: Number(process.env.CB_WINDOW_PERIOD_MS ?? "60000"),
- cbErrorThreshold: Number(process.env.CB_ERROR_THRESHOLD ?? "0.5"),
- cbMinAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "10"),
- cbCooldownMs: Number(process.env.CB_COOLDOWN_MS ?? "60000"),
+ // Max tokens the bucket can hold — absorbs short traffic bursts without throttling (default: 2250)
+ burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? "2250"),
+ // Probe rate to test endpoint recovery when half-open (default: 1/60 req/s)
+ probeRateLimit: Number(process.env.CB_PROBE_RATE_LIMIT ?? String(1 / 60)),
+ // Linear ramp-up after circuit closes, avoids flooding a freshly recovered endpoint (default: 10 min)
+ recoveryPeriodMs: Number(process.env.CB_RECOVERY_PERIOD_MS ?? "600000"),
+ // Sliding window over which failure rates are sampled (default: 5 min)
+ samplePeriodMs: Number(process.env.CB_SAMPLE_PERIOD_MS ?? "300000"),
+ // Failure rate within the sample window that triggers circuit open (default: 30%)
+ failureThreshold: Number(process.env.CB_FAILURE_THRESHOLD ?? "0.3"),
+ // Minimum attempts in the sample window before the failure rate is evaluated (default: 5 attempts)
+ minAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "5"),
+ // Full block after circuit opens, before half-open probes begin (default: 2 min)
+ cooldownPeriodMs: Number(process.env.CB_COOLDOWN_PERIOD_MS ?? "120000"),
};
type CallbackDeliveryMessage = {
@@ -62,223 +67,257 @@ type CallbackDeliveryMessage = {
targetId: string;
};
-async function checkAdmission(
- redis: RedisClientType,
- targetId: string,
- invocationRateLimit: number,
- cbEnabled: boolean,
- clientId: string,
- record: SQSRecord,
- correlationId?: string,
-): Promise<void> {
- const gateResult = await admit(
- redis,
- targetId,
- invocationRateLimit,
- cbEnabled,
- gateConfig,
- );
+type TargetBatch = {
+ targetId: string;
+ records: SQSRecord[];
+ messages: CallbackDeliveryMessage[];
+};
- if (!gateResult.allowed) {
- const delaySec = Math.ceil(gateResult.retryAfterMs / 1000);
- recordAdmissionDenied(clientId, targetId, gateResult.reason, correlationId);
- await changeVisibility(record.receiptHandle, delaySec);
- throw new VisibilityManagedError(`Admission denied: ${gateResult.reason}`);
+function groupByTarget(records: SQSRecord[]): TargetBatch[] {
+ const groups = new Map<
+ string,
+ { records: SQSRecord[]; messages: CallbackDeliveryMessage[] }
+ >();
+
+ for (const record of records) {
+ const message: CallbackDeliveryMessage = JSON.parse(record.body);
+ const existing = groups.get(message.targetId);
+ if (existing) {
+ existing.records.push(record);
+ existing.messages.push(message);
+ } else {
+ groups.set(message.targetId, { records: [record], messages: [message] });
+ }
}
-}
-const OUTCOME_DELIVERED = "delivered" as const;
-const OUTCOME_DLQ = "dlq" as const;
-type RecordOutcome = typeof OUTCOME_DELIVERED | typeof OUTCOME_DLQ;
+ return [...groups.entries()].map(
+ ([targetId, { messages, records: recs }]) => ({
+ targetId,
+ records: recs,
+ messages,
+ }),
+ );
+}
-async function handleDeliveryResult(
- result: DeliveryResult,
+async function deliverRecord(
record: SQSRecord,
- redis: RedisClientType,
+ message: CallbackDeliveryMessage,
+ target: Awaited<ReturnType<typeof loadTargetConfig>>,
+ applicationId: string,
clientId: string,
- targetId: string,
- cbEnabled: boolean,
- correlationId?: string,
-): Promise<RecordOutcome> {
+): Promise<{ success: boolean }> {
+ const correlationId = message.payload.data[0]?.attributes?.messageId;
+
+ const maxRetryDurationMs =
+ target.delivery?.maxRetryDurationSeconds === undefined
+ ? DEFAULT_MAX_RETRY_DURATION_MS
+ : target.delivery.maxRetryDurationSeconds * 1000;
+
+ const firstReceivedMs = Number(
+ record.attributes.ApproximateFirstReceiveTimestamp,
+ );
+
+ if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) {
+ recordRetryWindowExhausted(clientId, message.targetId, correlationId);
+ await sendToDlq(record.body);
+ return { success: true };
+ }
+
+ const agent = await buildAgent(target);
+ const signature = signPayload(
+ applicationId,
+ target.apiKey.headerValue,
+ message.payload,
+ );
+ const payloadJson = JSON.stringify(message.payload);
+
+ recordDeliveryAttempt(clientId, message.targetId, correlationId);
+ const deliveryStart = Date.now();
+ const result = await deliverPayload(target, payloadJson, signature, agent);
+ recordDeliveryDuration(message.targetId, Date.now() - deliveryStart);
+
if (result.outcome === OUTCOME_SUCCESS) {
- if (cbEnabled) {
- const cbOutcome = await recordResult(redis, targetId, true, gateConfig);
- if (cbOutcome.ok && cbOutcome.state === "closed") {
- recordCircuitBreakerClosed(targetId, correlationId);
- }
- }
- recordDeliverySuccess(clientId, targetId, correlationId);
- return OUTCOME_DELIVERED;
+ recordDeliverySuccess(clientId, message.targetId, correlationId);
+ return { success: true };
}
if (result.outcome === OUTCOME_PERMANENT_FAILURE) {
recordDeliveryPermanentFailure(
clientId,
- targetId,
+ message.targetId,
result.statusCode,
result.errorCode,
correlationId,
);
await sendToDlq(record.body, result);
- return OUTCOME_DLQ;
+ return { success: true };
}
if (result.outcome === OUTCOME_RATE_LIMITED) {
const receiveCount = Number(record.attributes.ApproximateReceiveCount);
- recordDeliveryRateLimited(clientId, targetId, correlationId);
+ recordDeliveryRateLimited(clientId, message.targetId, correlationId);
await handleRateLimitedRecord(
record,
clientId,
- targetId,
+ message.targetId,
result.retryAfterHeader,
receiveCount,
);
- return OUTCOME_DELIVERED; // unreachable — handleRateLimitedRecord always throws
+ return { success: true };
}
const receiveCount = Number(record.attributes.ApproximateReceiveCount);
const backoffSec = jitteredBackoffSeconds(receiveCount);
- if (cbEnabled) {
- const cbOutcome = await recordResult(redis, targetId, false, gateConfig);
- if (cbOutcome.state === "opened") {
- recordCircuitBreakerOpen(targetId, correlationId);
- }
- }
recordDeliveryFailure(
clientId,
- targetId,
+ message.targetId,
result.statusCode,
backoffSec,
receiveCount,
correlationId,
);
await changeVisibility(record.receiptHandle, backoffSec);
- throw new VisibilityManagedError(`Transient failure: ${result.statusCode}`);
+ return { success: false };
}
-async function processRecord(
- record: SQSRecord,
+async function processTargetBatch(
+ batch: TargetBatch,
redis: RedisClientType,
-): Promise<RecordOutcome> {
- const { CLIENT_ID } = process.env;
- if (!CLIENT_ID) {
- throw new Error("CLIENT_ID is required");
+ clientId: string,
+ concurrencyLimit: number,
+): Promise<SQSBatchItemFailure[]> {
+ const target = await loadTargetConfig(clientId, batch.targetId);
+ const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? false;
+
+ const gateResult = await admit(
+ redis,
+ batch.targetId,
+ target.invocationRateLimit,
+ cbEnabled,
+ batch.records.length,
+ gateConfig,
+ );
+
+ if (!gateResult.allowed) {
+ const baseDelaySec = Math.ceil(gateResult.retryAfterMs / 1000);
+ recordAdmissionDenied(clientId, batch.targetId, gateResult.reason);
+ const failures: SQSBatchItemFailure[] = [];
+ for (const record of batch.records) {
+ // eslint-disable-next-line sonarjs/pseudo-random -- jitter for backoff, not security-sensitive
+ const jitterSec = Math.floor(Math.random() * 5);
+ await changeVisibility(record.receiptHandle, baseDelaySec + jitterSec);
+ failures.push({ itemIdentifier: record.messageId });
+ }
+ return failures;
}
- const message: CallbackDeliveryMessage = JSON.parse(record.body);
- const { payload, targetId } = message;
- const messageId = payload.data[0]?.attributes?.messageId;
+ const { consumedTokens } = gateResult;
+ const admitted = batch.records.slice(0, consumedTokens);
+ const rejected = batch.records.slice(consumedTokens);
+ const admittedMessages = batch.messages.slice(0, consumedTokens);
- logger.info("Processing delivery", {
- clientId: CLIENT_ID,
- targetId,
- messageId,
- sqsMessageId: record.messageId,
- receiveCount: record.attributes.ApproximateReceiveCount,
- });
+ const applicationId = await getApplicationId(clientId);
- const target = await loadTargetConfig(CLIENT_ID, targetId);
- const maxRetryDurationMs =
- target.delivery?.maxRetryDurationSeconds === undefined
- ? DEFAULT_MAX_RETRY_DURATION_MS
- : target.delivery.maxRetryDurationSeconds * 1000;
+ const failures: SQSBatchItemFailure[] = [];
+ let processingFailures = 0;
- const firstReceivedMs = Number(
- record.attributes.ApproximateFirstReceiveTimestamp,
+ const deliveryResults = await pMap(
+ admitted,
+ async (record, index): Promise<{ record: SQSRecord; success: boolean }> => {
+ try {
+ const outcome = await deliverRecord(
+ record,
+ admittedMessages[index],
+ target,
+ applicationId,
+ clientId,
+ );
+ return { record, success: outcome.success };
+ } catch (error) {
+ logger.error("Failed to process record", {
+ messageId: record.messageId,
+ err: error,
+ });
+ const receiveCount = Number(record.attributes.ApproximateReceiveCount);
+ await changeVisibility(
+ record.receiptHandle,
+ jitteredBackoffSeconds(receiveCount),
+ );
+ return { record, success: false };
+ }
+ },
+ { concurrency: concurrencyLimit },
);
- if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) {
- recordRetryWindowExhausted(CLIENT_ID, targetId, messageId);
- await sendToDlq(record.body);
- return OUTCOME_DLQ;
+ for (const { record, success } of deliveryResults) {
+ if (!success) {
+ processingFailures += 1;
+ failures.push({ itemIdentifier: record.messageId });
+ }
}
- const applicationId = await getApplicationId(CLIENT_ID);
- const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? false;
-
- await checkAdmission(
- redis,
- targetId,
- target.invocationRateLimit,
- cbEnabled,
- CLIENT_ID,
- record,
- messageId,
- );
-
- const agent = await buildAgent(target);
- const signature = signPayload(
- applicationId,
- target.apiKey.headerValue,
- payload,
- );
- const payloadJson = JSON.stringify(payload);
+ if (cbEnabled && consumedTokens > 0) {
+ const cbOutcome = await recordResult(
+ redis,
+ batch.targetId,
+ consumedTokens,
+ processingFailures,
+ gateConfig,
+ );
+ if (cbOutcome.stateChanged && cbOutcome.circuitState === "open") {
+ recordCircuitBreakerOpen(batch.targetId);
+ }
+ if (
+ cbOutcome.stateChanged &&
+ cbOutcome.circuitState === "closed_recovery"
+ ) {
+ recordCircuitBreakerClosed(batch.targetId);
+ }
+ }
- recordDeliveryAttempt(CLIENT_ID, targetId, messageId);
- const deliveryStart = Date.now();
- const result = await deliverPayload(target, payloadJson, signature, agent);
- recordDeliveryDuration(targetId, Date.now() - deliveryStart);
+ for (const record of rejected) {
+ failures.push({ itemIdentifier: record.messageId });
+ }
- return handleDeliveryResult(
- result,
- record,
- redis,
- CLIENT_ID,
- targetId,
- cbEnabled,
- messageId,
- );
+ return failures;
}
export async function processRecords(
records: SQSRecord[],
): Promise<SQSBatchItemFailure[]> {
- resetMetrics();
+ const { CLIENT_ID } = process.env;
+ if (!CLIENT_ID) {
+ throw new Error("CLIENT_ID is required");
+ }
- logger.info("Batch received", { batchSize: records.length });
+ resetMetrics();
const concurrencyLimit = Number(
process.env.CONCURRENCY_LIMIT ?? String(DEFAULT_CONCURRENCY_LIMIT),
);
+ logger.info("Batch received", { batchSize: records.length });
+
const redis = await getRedisClient();
+ const targetBatches = groupByTarget(records);
- const results = await pMap(
- records,
- async (record): Promise<RecordOutcome | SQSBatchItemFailure> => {
- try {
- return await processRecord(record, redis);
- } catch (error) {
- if (!(error instanceof VisibilityManagedError)) {
- logger.error("Failed to process record", {
- messageId: record.messageId,
- err: error,
- });
- const receiveCount = Number(
- record.attributes.ApproximateReceiveCount,
- );
- await changeVisibility(
- record.receiptHandle,
- jitteredBackoffSeconds(receiveCount),
- );
- }
- return { itemIdentifier: record.messageId };
- }
- },
- { concurrency: concurrencyLimit },
- );
+ const allFailures: SQSBatchItemFailure[] = [];
+
+ for (const batch of targetBatches) {
+ const batchFailures = await processTargetBatch(
+ batch,
+ redis,
+ CLIENT_ID,
+ concurrencyLimit,
+ );
+ allFailures.push(...batchFailures);
+ }
- await flushMetrics();
- const failures = results.filter(
- (r): r is SQSBatchItemFailure => typeof r === "object",
- );
- const deliveredCount = results.filter((r) => r === OUTCOME_DELIVERED).length;
- const dlqCount = results.filter((r) => r === OUTCOME_DLQ).length;
logger.info("Batch complete", {
batchSize: records.length,
- deliveredCount,
- dlqCount,
- failureCount: failures.length,
+ failureCount: allFailures.length,
});
- return failures;
+
+ await flushMetrics();
+ return allFailures;
}
diff --git a/lambdas/https-client-lambda/src/services/admit.lua b/lambdas/https-client-lambda/src/services/admit.lua
index fd56decb..53b6977c 100644
--- a/lambdas/https-client-lambda/src/services/admit.lua
+++ b/lambdas/https-client-lambda/src/services/admit.lua
@@ -1,203 +1,108 @@
--- admit.lua — Decides whether a request to an endpoint is allowed.
+-- admit.lua — Pre-processing: determines rate limit and consumes tokens.
--
--- Three sequential checks run atomically:
--- 1. Circuit breaker — is the endpoint currently healthy?
--- 2. Sliding window — roll the two-window error-rate accounting state if needed
--- 3. Token bucket — is the endpoint within its rate limit?
+-- Two sequential steps run atomically:
+-- 1. Circuit breaker — determine effective rate from circuit state
+-- 2. Token bucket — consume tokens for the target batch
--
--- A request is allowed only when all three checks pass.
+-- The circuit has four states:
+-- Open (during cooldown): rate = 0, complete block, bucket untouched
+-- Half-open (after cooldown): rate = probeRateLimit
+-- Recovering (closed, during recovery period): linear ramp-up
+-- Normal (closed): full configured rate
--
--- While the circuit is open, a timed probe is let through at most once per
--- cbProbeIntervalMs so the caller can test whether the endpoint has recovered.
--- The probe bypasses the rate limit — counting it here would skew a
--- low-volume probe signal against the recovery decision.
---
--- After the circuit closes, the token fill rate ramps up linearly from
--- near-zero to full over decayPeriodMs to avoid a thundering herd on recovery.
---
--- Returns: { allowed (0|1), reason, retryAfterMs, effectiveRate }
+-- Returns: { consumedTokens, reason, retryAfterMs, effectiveRate }
-- Keys
-local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash
-local rlKey = KEYS[2] -- rl:{endpoint} rate limiter state hash
+local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash
-- Arguments
-local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms)
-local capacity = tonumber(ARGV[2]) or 0 -- token bucket maximum capacity
-local refillPerSec = tonumber(ARGV[3]) or 0 -- full token fill rate (tokens/sec)
-local cooldownMs = tonumber(ARGV[4]) or 0 -- how long the circuit stays open (ms)
-local decayPeriodMs = tonumber(ARGV[5]) or 0 -- ramp-up window after circuit closes (ms)
-local cbWindowPeriodMs = tonumber(ARGV[6]) or 0 -- error-rate sliding window duration (ms)
-local cbProbeIntervalMs = tonumber(ARGV[7]) or 0 -- minimum gap between probe requests (ms; 0 = no probes)
-
--- TTL policy: circuit breaker state must outlive the cooldown window so that
--- the ramp-up period remains visible to subsequent calls after a close.
--- Rate limiter state needs only a short idle window.
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60
-local rlTtlSeconds = 120
+local now = tonumber(ARGV[1]) or 0
+local capacity = tonumber(ARGV[2]) or 0
+local targetRateLimit = tonumber(ARGV[3]) or 0
+local cooldownMs = tonumber(ARGV[4]) or 0
+local recoveryPeriodMs = tonumber(ARGV[5]) or 0
+local probeRateLimit = tonumber(ARGV[6]) or 0
+local targetBatchSize = tonumber(ARGV[7]) or 0
--------------------------------------------------------------------------------
-- LOAD STATE
--------------------------------------------------------------------------------
-local cb = redis.call("HMGET", cbKey,
- "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts", "last_probe_ms",
- "cb_prev_failures", "cb_prev_attempts")
-local openedUntil = tonumber(cb[1] or "0")
-local cbWindowFrom = tonumber(cb[2] or "0")
-local cbFailures = tonumber(cb[3] or "0")
-local cbAttempts = tonumber(cb[4] or "0")
-local lastProbeMs = tonumber(cb[5] or "0")
-local cbPrevFailures = tonumber(cb[6] or "0")
-local cbPrevAttempts = tonumber(cb[7] or "0")
-
-local rl = redis.call("HMGET", rlKey, "tokens", "last_refill_ms")
-local tokens = tonumber(rl[1] or capacity)
-local lastRefill = tonumber(rl[2] or now)
+local state = redis.call("HMGET", epKey,
+ "is_open", "switched_at", "bucket_tokens", "bucket_refilled_at")
+local isOpenRaw = state[1]
+local needInit = isOpenRaw == false or isOpenRaw == nil
+local isOpen = needInit or tonumber(isOpenRaw) == 1
+local switchedAt = needInit and 0 or tonumber(state[2] or "0")
+local bucketTokens = tonumber(state[3] or "0")
+local bucketRefilledAt = needInit and now or tonumber(state[4] or "0")
--------------------------------------------------------------------------------
--- 1. CIRCUIT BREAKER
---
--- The circuit is open when openedUntil is set and has not yet elapsed.
--- All requests are rejected while open to give the endpoint time to recover.
---
--- Timed probes: once per cbProbeIntervalMs a single request is allowed
--- through even while the circuit is open. The caller must record the
--- outcome via record-result.lua; a successful probe will close the circuit
--- and trigger the ramp-up phase.
+-- 1. CIRCUIT BREAKER — determine effective rate
--------------------------------------------------------------------------------
-if openedUntil > 0 and now < openedUntil then
- -- Allow a probe through if the probe interval has elapsed
- if cbProbeIntervalMs > 0 and (now - lastProbeMs) >= cbProbeIntervalMs then
- lastProbeMs = now
- redis.call("HSET", cbKey,
- "opened_until_ms", openedUntil,
- "cb_window_from", cbWindowFrom,
- "cb_failures", cbFailures,
- "cb_attempts", cbAttempts,
- "last_probe_ms", lastProbeMs,
- "cb_prev_failures", cbPrevFailures,
- "cb_prev_attempts", cbPrevAttempts
- )
- redis.call("EXPIRE", cbKey, cbTtlSeconds)
- return { 1, "probe", 0, 0 }
- end
-
- -- Circuit is open and no probe slot is available — reject
- return { 0, "circuit_open", openedUntil - now, 0 }
-end
+local isHalfOpen = isOpen and now > switchedAt + cooldownMs
+local isRecovering = (not isOpen) and now < switchedAt + recoveryPeriodMs
---------------------------------------------------------------------------------
--- 2. SLIDING WINDOW
---
--- Two windows (current + previous) together approximate a sliding window over
--- cbWindowPeriodMs. When the current window expires it is promoted to previous
--- and a fresh current window starts. record-result.lua blends the two windows
--- using a time-based weight to smooth the error rate across the boundary rather
--- than resetting it to zero at expiry.
---
--- record-result.lua is responsible for incrementing the counters; this script
--- is only responsible for rolling the window boundary forward when it expires.
---------------------------------------------------------------------------------
+local effectiveRate
-if cbWindowFrom == 0 then
- -- No window exists yet — start one now
- cbWindowFrom = now
-elseif (now - cbWindowFrom) > cbWindowPeriodMs then
- -- Current window has expired — roll it forward
- if (now - cbWindowFrom) > (2 * cbWindowPeriodMs) then
- -- Both current and previous windows are stale: a long quiet period means
- -- old failure counts are no longer relevant to the health of the endpoint.
- cbPrevFailures = 0
- cbPrevAttempts = 0
+if isOpen then
+ if isHalfOpen then
+ effectiveRate = probeRateLimit
+ else
+ return { 0, "circuit_open", (switchedAt + cooldownMs) - now, 0 }
+ end
+else
+ if isRecovering then
+ local rampRange = math.max(0, targetRateLimit - probeRateLimit)
+ local rampProgress = math.max(0, now - switchedAt) / recoveryPeriodMs
+ effectiveRate = probeRateLimit + rampProgress * rampRange
else
- -- Promote current → previous so it can be blended with the new current window
- cbPrevFailures = cbFailures
- cbPrevAttempts = cbAttempts
+ effectiveRate = targetRateLimit
end
- cbFailures = 0
- cbAttempts = 0
- cbWindowFrom = now
end
--------------------------------------------------------------------------------
--- 3. TOKEN BUCKET
+-- 2. TOKEN BUCKET — batch consumption
--
--- Refills tokens based on elapsed time, then tries to consume one.
--- If no tokens are available the request is rate-limited.
+-- Generate tokens based on elapsed time, then consume as many as needed for
+-- the batch, up to the number available.
--
--- Ramp-up: after the circuit closes (openedUntil is set but in the past),
--- effectiveRate scales linearly from near-zero to the full refillPerSec over
--- decayPeriodMs. This deliberately slows recovery traffic so a flapping
--- endpoint is not immediately overwhelmed.
--- Once decayPeriodMs elapses, openedUntil is cleared and the full rate resumes.
+-- bucketRefilledAt tracks the point in time up to which tokens have been
+-- generated. We advance it by exactly the time needed to produce the whole
+-- tokens we generated (generationTime), rather than setting it to `now`.
+--
+-- Why not `now`? Token generation uses floor(), so any sub-token fractional
+-- time is truncated. Setting bucketRefilledAt = now would discard that
+-- remainder, meaning the next call starts its elapsed-time calculation from
+-- a later point than it should. Over many calls this causes token leakage —
+-- the bucket refills slower than the configured rate. By advancing only by
+-- generationTime, the leftover fractional time carries over to the next call.
--------------------------------------------------------------------------------
-local effectiveRate = refillPerSec
-
-if openedUntil > 0 and now > openedUntil and decayPeriodMs > 0 then
- -- Circuit has recently closed — apply linear ramp-up
- local sinceClose = now - openedUntil
- if sinceClose >= decayPeriodMs then
- -- Decay period fully elapsed — restore full rate and clear the CB timestamp
- openedUntil = 0
- else
- -- Still within decay period — scale fill rate proportionally to time elapsed
- local fraction = sinceClose / decayPeriodMs
- effectiveRate = math.max(1, math.floor(refillPerSec * fraction))
- end
+if isOpen then
+ bucketTokens = 0
end
--- Refill tokens based on time elapsed since last refill
-local elapsed = now - lastRefill
-if elapsed > 0 then
- local refill = math.floor((elapsed * effectiveRate) / 1000)
- if refill > 0 then
- tokens = math.min(capacity, tokens + refill)
- lastRefill = now
- end
-end
+local generatedTokens = math.floor((now - bucketRefilledAt) * effectiveRate / 1000)
+local availTokens = math.min(capacity, bucketTokens + generatedTokens)
+local consumedTokens = math.min(targetBatchSize, availTokens)
--- Not enough tokens — rate-limited
--- TTL is intentionally not refreshed here; it was set on the last allowed call.
-if tokens < 1 then
- redis.call("HSET", cbKey,
- "opened_until_ms", openedUntil,
- "cb_window_from", cbWindowFrom,
- "cb_failures", cbFailures,
- "cb_attempts", cbAttempts,
- "cb_prev_failures", cbPrevFailures,
- "cb_prev_attempts", cbPrevAttempts
- )
- redis.call("HSET", rlKey,
- "tokens", tokens,
- "last_refill_ms", lastRefill
- )
- return { 0, "rate_limited", 1000, effectiveRate }
+bucketTokens = availTokens - consumedTokens
+if generatedTokens > 0 and effectiveRate > 0 then
+ local generationTime = generatedTokens * 1000 / effectiveRate
+ bucketRefilledAt = bucketRefilledAt + generationTime
end
--- Consume one token
-tokens = tokens - 1
-
--------------------------------------------------------------------------------
--- 4. PERSIST STATE AND ALLOW
+-- 3. PERSIST STATE AND RETURN
--------------------------------------------------------------------------------
-redis.call("HSET", cbKey,
- "opened_until_ms", openedUntil,
- "cb_window_from", cbWindowFrom,
- "cb_failures", cbFailures,
- "cb_attempts", cbAttempts,
- "cb_prev_failures", cbPrevFailures,
- "cb_prev_attempts", cbPrevAttempts
-)
-redis.call("HSET", rlKey,
- "tokens", tokens,
- "last_refill_ms", lastRefill
+redis.call("HSET", epKey,
+ "bucket_tokens", bucketTokens,
+ "bucket_refilled_at", bucketRefilledAt
)
-redis.call("EXPIRE", cbKey, cbTtlSeconds)
-redis.call("EXPIRE", rlKey, rlTtlSeconds)
-
-return { 1, "allowed", 0, effectiveRate }
+local reason = consumedTokens < 1 and "rate_limited" or "allowed"
+local retryAfter = consumedTokens < 1 and 1000 or 0
+return { consumedTokens, reason, retryAfter, effectiveRate }
diff --git a/lambdas/https-client-lambda/src/services/endpoint-gate.ts b/lambdas/https-client-lambda/src/services/endpoint-gate.ts
index c2d85439..8870e5d8 100644
--- a/lambdas/https-client-lambda/src/services/endpoint-gate.ts
+++ b/lambdas/https-client-lambda/src/services/endpoint-gate.ts
@@ -5,7 +5,7 @@ import recordResultLuaSrc from "services/record-result.lua";
export type AdmitResultAllowed = {
allowed: true;
- probe: boolean;
+ consumedTokens: number;
effectiveRate: number;
};
@@ -18,18 +18,21 @@ export type AdmitResultDenied = {
export type AdmitResult = AdmitResultAllowed | AdmitResultDenied;
-export type RecordResultOutcome =
- | { ok: true; state: "closed" }
- | { ok: false; state: "opened" | "failed" };
+export type CircuitState = "open" | "half_open" | "closed_recovery" | "closed";
+
+export type RecordResultOutcome = {
+ circuitState: CircuitState;
+ stateChanged: boolean;
+};
export type EndpointGateConfig = {
burstCapacity: number;
- cbProbeIntervalMs: number;
- decayPeriodMs: number;
- cbWindowPeriodMs: number;
- cbErrorThreshold: number;
- cbMinAttempts: number;
- cbCooldownMs: number;
+ probeRateLimit: number;
+ recoveryPeriodMs: number;
+ samplePeriodMs: number;
+ failureThreshold: number;
+ minAttempts: number;
+ cooldownPeriodMs: number;
};
let admitSha: string | undefined;
@@ -76,22 +79,21 @@ export async function admit(
targetId: string,
refillPerSec: number,
cbEnabled: boolean,
+ targetBatchSize: number,
config: EndpointGateConfig,
): Promise<AdmitResult> {
- const cbKey = `cb:{${targetId}}`;
- const rlKey = `rl:{${targetId}}`;
+ const epKey = `ep:{${targetId}}`;
const now = Date.now().toString();
- const probeIntervalMs = cbEnabled ? config.cbProbeIntervalMs.toString() : "0";
+ const probeRate = cbEnabled ? config.probeRateLimit.toString() : "0";
const args = [
now,
config.burstCapacity.toString(),
- // eslint-disable-next-line sonarjs/null-dereference
- refillPerSec.toString(),
- config.cbCooldownMs.toString(),
- config.decayPeriodMs.toString(),
- config.cbWindowPeriodMs.toString(),
- probeIntervalMs,
+ String(refillPerSec),
+ config.cooldownPeriodMs.toString(),
+ config.recoveryPeriodMs.toString(),
+ probeRate,
+ String(targetBatchSize),
];
if (!admitSha) {
@@ -102,16 +104,16 @@ export async function admit(
client,
admitLuaSrc,
admitSha,
- [cbKey, rlKey],
+ [epKey],
args,
)) as [number, string, number, number];
- const [allowed, reason, retryAfterMs, effectiveRate] = raw;
+ const [consumedOrFlag, reason, retryAfterMs, effectiveRate] = raw;
- if (allowed === 1) {
+ if (reason === "allowed" || reason === "probe") {
return {
allowed: true,
- probe: reason === "probe",
+ consumedTokens: Number(consumedOrFlag),
effectiveRate: Number(effectiveRate),
};
}
@@ -127,20 +129,22 @@ export async function admit(
export async function recordResult(
client: RedisClientType,
targetId: string,
- success: boolean,
+ consumedTokens: number,
+ processingFailures: number,
config: EndpointGateConfig,
): Promise<RecordResultOutcome> {
- const cbKey = `cb:{${targetId}}`;
+ const epKey = `ep:{${targetId}}`;
const now = Date.now().toString();
const args = [
now,
- success ? "1" : "0",
- config.cbCooldownMs.toString(),
- config.decayPeriodMs.toString(),
- config.cbErrorThreshold.toString(),
- config.cbMinAttempts.toString(),
- config.cbWindowPeriodMs.toString(),
+ String(consumedTokens),
+ String(processingFailures),
+ config.cooldownPeriodMs.toString(),
+ config.recoveryPeriodMs.toString(),
+ config.failureThreshold.toString(),
+ config.minAttempts.toString(),
+ config.samplePeriodMs.toString(),
];
if (!recordResultSha) {
@@ -151,17 +155,16 @@ export async function recordResult(
client,
recordResultLuaSrc,
recordResultSha,
- [cbKey],
+ [epKey],
args,
- )) as [number, string];
-
- const [ok, state] = raw;
+ )) as [string, number];
- if (ok === 1) {
- return { ok: true, state: "closed" };
- }
+ const [circuitState, stateChanged] = raw;
- return { ok: false, state: state as "opened" | "failed" };
+ return {
+ circuitState: circuitState as CircuitState,
+ stateChanged: stateChanged === 1,
+ };
}
export function resetAdmitSha(): void {
diff --git a/lambdas/https-client-lambda/src/services/record-result.lua b/lambdas/https-client-lambda/src/services/record-result.lua
index 1cc94857..fa42efea 100644
--- a/lambdas/https-client-lambda/src/services/record-result.lua
+++ b/lambdas/https-client-lambda/src/services/record-result.lua
@@ -1,144 +1,169 @@
--- record-result.lua — Records the outcome of a delivery attempt.
+-- record-result.lua — Post-processing: updates sampling and circuit breaker.
--
--- Updates the circuit breaker's error-rate window counters and opens the
--- circuit if the failure rate exceeds the configured threshold.
+-- After processing a batch, this script:
+-- 1. Manages the sliding window (rolling forward as necessary)
+-- 2. Records new attempts and failures (unless fully open)
+-- 3. Interpolates attempt/failure rates using the sliding window
+-- 4. Checks whether to close the circuit (half-open + successes)
+-- 5. Checks whether to open the circuit (closed + threshold exceeded)
--
--- On success:
--- Window counters are left intact. The openedUntil timestamp is preserved
--- while the decay period is still active so that admit.lua can continue
--- computing the linear ramp-up rate. Once the decay period elapses it
--- is zeroed, returning the circuit to a fully clean closed state.
+-- Returns: { circuitState, stateChanged }
--
--- On failure:
--- The failure and attempt counters are incremented. A two-window sliding
--- blend is computed before evaluating the trip condition:
--- slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight
--- slidingFailures = cbFailures + cbPrevFailures * prevWeight
--- where prevWeight decays linearly from 1.0 → 0.0 as the current window ages,
--- so previous-window failures fade out gradually rather than dropping off a cliff.
--- The circuit opens when:
--- • the endpoint is not already open (prevents double-tripping and
--- resetting the cooldown timer prematurely), AND
--- • slidingAttempts >= cbMinAttempts (avoids tripping on statistically
--- insignificant data at cold start or just after a window roll), AND
--- • slidingFailures / slidingAttempts exceeds cbErrorThreshold.
--- On open, all counters (current and previous) are reset to zero so the
--- fresh cooldown window begins with a clean slate ready for recovery.
+-- circuitState: the current state of the circuit after this run
+-- "open" — fully open (during cooldown, no probes)
+-- "half_open" — open but past cooldown (probing)
+-- "closed_recovery" — closed but ramping up (recovery period)
+-- "closed" — closed, running at full rate
--
--- Returns: { ok (0|1), state }
--- state: "closed" | "opened" | "failed"
+-- stateChanged: whether a circuit transition occurred this run
+-- 1 — the circuit opened or closed during this execution
+-- 0 — no state transition
+
+-- Circuit state constants
+local OPEN = "open"
+local HALF_OPEN = "half_open"
+local CLOSED_RECOVERY = "closed_recovery"
+local CLOSED = "closed"
-- Keys
-local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash
+local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash
-- Arguments
-local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms)
-local success = tonumber(ARGV[2]) or 0 -- 1 = success, 0 = failure
-local cooldownMs = tonumber(ARGV[3]) or 0 -- how long the circuit stays open (ms)
-local decayPeriodMs = tonumber(ARGV[4]) or 0 -- ramp-up window after circuit closes (ms)
-local cbErrorThreshold = tonumber(ARGV[5]) or 0 -- error-rate fraction that trips the circuit (e.g. 0.5)
-local cbMinAttempts = tonumber(ARGV[6]) or 0 -- minimum samples before the circuit can trip
-local cbWindowPeriodMs = tonumber(ARGV[7]) or 0 -- error-rate sliding window duration (ms)
-
--- TTL policy: keep circuit breaker state alive for at least the cooldown
--- duration plus a buffer so the decay period remains visible after a close.
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60
-
-local function refreshCbExpiry()
- redis.call("EXPIRE", cbKey, cbTtlSeconds)
-end
+local now = tonumber(ARGV[1]) or 0
+local consumedTokens = tonumber(ARGV[2]) or 0
+local processingFailures = tonumber(ARGV[3]) or 0
+local cooldownPeriodMs = tonumber(ARGV[4]) or 0
+local recoveryPeriodMs = tonumber(ARGV[5]) or 0
+local failureThreshold = tonumber(ARGV[6]) or 0
+local minAttempts = tonumber(ARGV[7]) or 0
+local samplePeriodMs = tonumber(ARGV[8]) or 0
--------------------------------------------------------------------------------
-- LOAD CURRENT STATE
--------------------------------------------------------------------------------
-local cb = redis.call("HMGET", cbKey,
- "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts",
- "cb_prev_failures", "cb_prev_attempts")
-local openedUntil = tonumber(cb[1] or "0")
-local cbWindowFrom = tonumber(cb[2] or "0")
-local cbFailures = tonumber(cb[3] or "0")
-local cbAttempts = tonumber(cb[4] or "0")
-local cbPrevFailures = tonumber(cb[5] or "0")
-local cbPrevAttempts = tonumber(cb[6] or "0")
+local state = redis.call("HMGET", epKey,
+ "is_open", "switched_at",
+ "cur_attempts", "prev_attempts", "cur_failures", "prev_failures",
+ "sample_till")
+local isOpenRaw = state[1]
+local needInit = isOpenRaw == false or isOpenRaw == nil
+local isOpen = needInit or tonumber(isOpenRaw) == 1
+local switchedAt = needInit and 0 or tonumber(state[2] or "0")
+local curAttempts = tonumber(state[3] or "0")
+local prevAttempts = tonumber(state[4] or "0")
+local curFailures = tonumber(state[5] or "0")
+local prevFailures = tonumber(state[6] or "0")
+local sampleTill = tonumber(state[7] or "0")
+
+--------------------------------------------------------------------------------
+-- 1. DETERMINE CIRCUIT SUB-STATE
+--------------------------------------------------------------------------------
--- Every outcome (success or failure) contributes to the error-rate window
-cbAttempts = cbAttempts + 1
+local isHalfOpen = isOpen and now > switchedAt + cooldownPeriodMs
+local isFullyOpen = isOpen and not isHalfOpen
--------------------------------------------------------------------------------
--- SUCCESS — preserve openedUntil during decay, then zero it
---
--- admit.lua uses openedUntil to calculate the linear ramp-up rate while the
--- decay period is active. That timestamp must survive in Redis until the
--- decay period ends. Clearing it prematurely would snap the fill rate back
--- to full immediately rather than ramping gradually.
---------------------------------------------------------------------------------
-
-if success == 1 then
- -- Keep openedUntil only if we are still within the decay window
- local inDecayWindow = openedUntil > 0 and now > openedUntil and (now - openedUntil) < decayPeriodMs
- local preservedOpenedUntil = inDecayWindow and openedUntil or 0
-
- redis.call("HSET", cbKey,
- "opened_until_ms", preservedOpenedUntil,
- "cb_window_from", cbWindowFrom,
- "cb_failures", cbFailures,
- "cb_attempts", cbAttempts,
- "cb_prev_failures", cbPrevFailures,
- "cb_prev_attempts", cbPrevAttempts
- )
- refreshCbExpiry()
- return { 1, "closed" }
+-- 2. MANAGE SLIDING WINDOW
+--------------------------------------------------------------------------------
+
+if sampleTill < now then
+ if sampleTill + samplePeriodMs < now then
+ -- Complete reset — window is too old
+ prevAttempts = 0
+ prevFailures = 0
+ sampleTill = now + samplePeriodMs
+ else
+ -- Promote current to previous
+ prevAttempts = curAttempts
+ prevFailures = curFailures
+ sampleTill = sampleTill + samplePeriodMs
+ end
+ curAttempts = 0
+ curFailures = 0
end
--------------------------------------------------------------------------------
--- FAILURE — increment counter and evaluate whether to open the circuit
---
--- The trip condition is evaluated against a sliding blend of current and
--- previous window counts, not the raw current-window counts alone. This
--- prevents a burst of failures from escaping detection simply because it
--- straddles a window boundary and gets partially discarded by a reset.
---------------------------------------------------------------------------------
-
-cbFailures = cbFailures + 1
-
--- The circuit is already open when openedUntil is set and has not yet elapsed.
--- Guard against double-tripping, which would reset the cooldown timer early.
-local circuitAlreadyOpen = openedUntil > 0 and now < openedUntil
-
--- Blend current and previous window counts.
--- prevWeight decays linearly from 1.0 → 0.0 as the current window ages,
--- so previous-window failures fade out gradually rather than dropping off a cliff.
-local windowElapsed = cbWindowFrom > 0 and (now - cbWindowFrom) or 0
-local hasWindow = cbWindowPeriodMs > 0
-local prevWeight = hasWindow and math.max(0, (cbWindowPeriodMs - windowElapsed) / cbWindowPeriodMs) or 0
-local slidingFailures = cbFailures + cbPrevFailures * prevWeight
-local slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight
-
-if not circuitAlreadyOpen
- and slidingAttempts >= cbMinAttempts -- enough data to be statistically meaningful
- and (slidingFailures / slidingAttempts) > cbErrorThreshold then
- -- Trip the circuit — reset all counters so recovery starts from a clean slate
- redis.call("HSET", cbKey,
- "opened_until_ms", now + cooldownMs,
- "cb_window_from", 0,
- "cb_failures", 0,
- "cb_attempts", 0,
- "cb_prev_failures", 0,
- "cb_prev_attempts", 0
- )
- refreshCbExpiry()
- return { 0, "opened" }
+-- 3. RECORD NEW ATTEMPTS/FAILURES (unless fully open)
+--------------------------------------------------------------------------------
+
+if not isFullyOpen then
+ curAttempts = curAttempts + consumedTokens
+ curFailures = curFailures + processingFailures
end
--- Below the threshold — record the failure but keep the circuit closed
-redis.call("HSET", cbKey,
- "opened_until_ms", openedUntil,
- "cb_window_from", cbWindowFrom,
- "cb_failures", cbFailures,
- "cb_attempts", cbAttempts,
- "cb_prev_failures", cbPrevFailures,
- "cb_prev_attempts", cbPrevAttempts
+--------------------------------------------------------------------------------
+-- 4. INTERPOLATE VALUES
+--------------------------------------------------------------------------------
+
+local weight = (sampleTill - now) / samplePeriodMs
+local attempts = prevAttempts * weight + curAttempts
+local failures = prevFailures * weight + curFailures
+
+--------------------------------------------------------------------------------
+-- 5. CIRCUIT BREAKER LOGIC
+--------------------------------------------------------------------------------
+
+local processingSuccesses = consumedTokens - processingFailures
+local stateChanged = false
+
+-- Close circuit when half-open and there are successes
+if isHalfOpen and processingSuccesses > 0 then
+ isOpen = false
+ switchedAt = now
+ stateChanged = true
+ -- fall through, allow circuit to immediately re-open
+end
+
+-- Open circuit when closed, enough samples, and threshold exceeded
+local hasSampledEnough = attempts >= minAttempts
+if not isOpen and hasSampledEnough and (failures / attempts) > failureThreshold then
+ isOpen = true
+ switchedAt = now
+ curAttempts = 0
+ curFailures = 0
+ prevAttempts = 0
+ prevFailures = 0
+ sampleTill = now + samplePeriodMs
+ stateChanged = true
+end
+
+--------------------------------------------------------------------------------
+-- 6. DETERMINE CURRENT CIRCUIT STATE FOR REPORTING
+--------------------------------------------------------------------------------
+
+local circuitState
+if isOpen then
+ if now > switchedAt + cooldownPeriodMs then
+ circuitState = HALF_OPEN
+ else
+ circuitState = OPEN
+ end
+else
+ if now < switchedAt + recoveryPeriodMs then
+ circuitState = CLOSED_RECOVERY
+ else
+ circuitState = CLOSED
+ end
+end
+
+--------------------------------------------------------------------------------
+-- 7. PERSIST STATE
+--------------------------------------------------------------------------------
+
+redis.call("HSET", epKey,
+ "cur_attempts", curAttempts,
+ "prev_attempts", prevAttempts,
+ "cur_failures", curFailures,
+ "prev_failures", prevFailures,
+ "sample_till", sampleTill
)
-refreshCbExpiry()
-return { 0, "failed" }
+
+if stateChanged then
+ redis.call("HSET", epKey,
+ "is_open", isOpen and 1 or 0,
+ "switched_at", switchedAt
+ )
+end
+
+return { circuitState, stateChanged and 1 or 0 }
diff --git a/lambdas/perf-runner-lambda/package.json b/lambdas/perf-runner-lambda/package.json
index 9f9d01d8..59d7691b 100644
--- a/lambdas/perf-runner-lambda/package.json
+++ b/lambdas/perf-runner-lambda/package.json
@@ -13,13 +13,17 @@
"typecheck": "tsc --noEmit"
},
"dependencies": {
+ "@aws-crypto/sha256-js": "catalog:aws",
"@aws-sdk/client-cloudwatch-logs": "catalog:aws",
"@aws-sdk/client-sqs": "catalog:aws",
+ "@aws-sdk/credential-providers": "catalog:aws",
+ "@smithy/signature-v4": "catalog:aws",
"@nhs-notify-client-callbacks/logger": "workspace:*",
"@nhs-notify-client-callbacks/models": "workspace:*",
- "esbuild": "catalog:tools"
+ "@redis/client": "catalog:app"
},
"devDependencies": {
+ "esbuild": "catalog:tools",
"@tsconfig/node22": "catalog:tools",
"@types/aws-lambda": "catalog:tools",
"@types/jest": "catalog:test",
diff --git a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts
index 055ac7bc..526de638 100644
--- a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts
+++ b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts
@@ -1,5 +1,10 @@
import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs";
-import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch";
+import {
+ queryCircuitBreakerSnapshot,
+ queryDeliveryMetricsSnapshot,
+ queryMetricsSnapshot,
+ queryPerClientRateTimeline,
+} from "cloudwatch";
const mockCloudWatchClient = {
send: jest.fn(),
@@ -285,3 +290,373 @@ describe("queryDeliveryMetricsSnapshot", () => {
expect(result).toBeNull();
});
});
+
+describe("queryCircuitBreakerSnapshot", () => {
+ it("returns null when logGroupNames is empty", async () => {
+ const result = await queryCircuitBreakerSnapshot(
+ mockCloudWatchClient,
+ [],
+ 0,
+ 60,
+ );
+
+ expect(result).toBeNull();
+ expect(mockCloudWatchClient.send).not.toHaveBeenCalled();
+ });
+
+ it("returns null when StartQuery returns no queryId", async () => {
+ mockCloudWatchClient.send.mockResolvedValueOnce({} as never);
+
+ const result = await queryCircuitBreakerSnapshot(
+ mockCloudWatchClient,
+ ["/aws/lambda/test-https-client-perf-client-1"],
+ 0,
+ 60,
+ );
+
+ expect(result).toBeNull();
+ });
+
+ it("returns a snapshot with zeroed metrics when the result row is empty", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-cb1" } as never)
+ .mockResolvedValueOnce({ status: "Complete", results: [] } as never);
+
+ const promise = queryCircuitBreakerSnapshot(
+ mockCloudWatchClient,
+ ["/aws/lambda/test-https-client-perf-client-1"],
+ 100,
+ 160,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toMatchObject({
+ intervalStartSec: 100,
+ intervalEndSec: 160,
+ circuitOpenEvents: 0,
+ circuitCloseEvents: 0,
+ admissionDeniedCircuitOpen: 0,
+ admissionDeniedRateLimited: 0,
+ deliveryAttempts: 0,
+ deliverySuccesses: 0,
+ deliveryFailures: 0,
+ deliveryRateLimited: 0,
+ });
+ expect(result?.snapshotAt).toBeGreaterThan(0);
+ });
+
+ it("returns a populated snapshot when query completes successfully", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-cb2" } as never)
+ .mockResolvedValueOnce({
+ status: "Complete",
+ results: [
+ [
+ { field: "circuitOpenEvents", value: "3" },
+ { field: "circuitCloseEvents", value: "2" },
+ { field: "admissionDeniedCircuitOpen", value: "15" },
+ { field: "admissionDeniedRateLimited", value: "8" },
+ { field: "deliveryAttempts", value: "200" },
+ { field: "deliverySuccesses", value: "180" },
+ { field: "deliveryFailures", value: "12" },
+ { field: "deliveryRateLimited", value: "8" },
+ ],
+ ],
+ } as never);
+
+ const promise = queryCircuitBreakerSnapshot(
+ mockCloudWatchClient,
+ ["/aws/lambda/test-https-client-perf-client-1"],
+ 100,
+ 160,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toMatchObject({
+ intervalStartSec: 100,
+ intervalEndSec: 160,
+ circuitOpenEvents: 3,
+ circuitCloseEvents: 2,
+ admissionDeniedCircuitOpen: 15,
+ admissionDeniedRateLimited: 8,
+ deliveryAttempts: 200,
+ deliverySuccesses: 180,
+ deliveryFailures: 12,
+ deliveryRateLimited: 8,
+ });
+ });
+
+ it("sends logGroupNames to StartQuery", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-cb3" } as never)
+ .mockResolvedValueOnce({ status: "Complete", results: [] } as never);
+
+ const logGroups = [
+ "/aws/lambda/test-https-client-perf-client-1",
+ "/aws/lambda/test-https-client-perf-client-2",
+ ];
+
+ const promise = queryCircuitBreakerSnapshot(
+ mockCloudWatchClient,
+ logGroups,
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ await promise;
+
+ const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as {
+ input: { logGroupNames: string[] };
+ };
+ expect(startCmd.input.logGroupNames).toEqual(logGroups);
+ });
+
+ it("returns null when the query status is Failed", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-cb4" } as never)
+ .mockResolvedValueOnce({ status: "Failed" } as never);
+
+ const promise = queryCircuitBreakerSnapshot(
+ mockCloudWatchClient,
+ ["/aws/lambda/test-https-client-perf-client-1"],
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toBeNull();
+ });
+});
+
+describe("queryPerClientRateTimeline", () => {
+ it("returns empty array when StartQuery returns no queryId", async () => {
+ mockCloudWatchClient.send.mockResolvedValueOnce({} as never);
+
+ const result = await queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ expect(result).toEqual([]);
+ });
+
+ it("returns empty array when the query status is Failed", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr1" } as never)
+ .mockResolvedValueOnce({ status: "Failed" } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toEqual([]);
+ });
+
+ it("returns empty array when results are empty", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr2" } as never)
+ .mockResolvedValueOnce({ status: "Complete", results: [] } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toEqual([]);
+ });
+
+ it("returns empty array when results is undefined", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr2b" } as never)
+ .mockResolvedValueOnce({ status: "Complete" } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toEqual([]);
+ });
+
+ it("defaults missing fields to zero", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr2c" } as never)
+ .mockResolvedValueOnce({
+ status: "Complete",
+ results: [[{ field: "unknownField", value: "123" }]],
+ } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toHaveLength(1);
+ expect(result[0].deliveryAttempts).toBe(0);
+ expect(result[0].timestampSec).toBe(
+ Math.floor(new Date("0").getTime() / 1000),
+ );
+ });
+
+ it("returns entries sorted by time bin when query completes", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr3" } as never)
+ .mockResolvedValueOnce({
+ status: "Complete",
+ results: [
+ [
+ { field: "timeBin", value: "2026-04-09 10:00:00.000" },
+ { field: "deliveryAttempts", value: "42" },
+ ],
+ [
+ { field: "timeBin", value: "2026-04-09 10:00:10.000" },
+ { field: "deliveryAttempts", value: "38" },
+ ],
+ ],
+ } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toHaveLength(2);
+ expect(result[0]).toEqual({
+ timestampSec: Math.floor(
+ new Date("2026-04-09 10:00:00.000").getTime() / 1000,
+ ),
+ deliveryAttempts: 42,
+ });
+ expect(result[1]).toEqual({
+ timestampSec: Math.floor(
+ new Date("2026-04-09 10:00:10.000").getTime() / 1000,
+ ),
+ deliveryAttempts: 38,
+ });
+ });
+
+ it("sends logGroupName to StartQuery", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr4" } as never)
+ .mockResolvedValueOnce({ status: "Complete", results: [] } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 100,
+ 200,
+ );
+
+ await jest.runAllTimersAsync();
+ await promise;
+
+ const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as {
+ input: { logGroupName: string; startTime: number; endTime: number };
+ };
+ expect(startCmd.input.logGroupName).toBe(
+ "/aws/lambda/test-https-client-perf-client-1",
+ );
+ expect(startCmd.input.startTime).toBe(100);
+ expect(startCmd.input.endTime).toBe(200);
+ });
+
+ it("polls until the query becomes Complete", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr5" } as never)
+ .mockResolvedValueOnce({ status: "Running" } as never)
+ .mockResolvedValueOnce({
+ status: "Complete",
+ results: [
+ [
+ { field: "timeBin", value: "2026-04-09 10:00:00.000" },
+ { field: "deliveryAttempts", value: "5" },
+ ],
+ ],
+ } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toHaveLength(1);
+ expect(result[0].deliveryAttempts).toBe(5);
+ expect(mockCloudWatchClient.send).toHaveBeenCalledTimes(3);
+ });
+
+ it("returns empty array when the query does not complete within the timeout", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr6" } as never)
+ .mockResolvedValue({ status: "Running" } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.advanceTimersByTimeAsync(60_000);
+ const result = await promise;
+
+ expect(result).toEqual([]);
+ });
+
+ it("returns empty array when the query status is Cancelled", async () => {
+ mockCloudWatchClient.send
+ .mockResolvedValueOnce({ queryId: "qid-pcr7" } as never)
+ .mockResolvedValueOnce({ status: "Cancelled" } as never);
+
+ const promise = queryPerClientRateTimeline(
+ mockCloudWatchClient,
+ "/aws/lambda/test-https-client-perf-client-1",
+ 0,
+ 60,
+ );
+
+ await jest.runAllTimersAsync();
+ const result = await promise;
+
+ expect(result).toEqual([]);
+ });
+});
diff --git a/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts
new file mode 100644
index 00000000..09846ed3
--- /dev/null
+++ b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts
@@ -0,0 +1,74 @@
+import { flushElastiCache } from "elasticache";
+import type { ElastiCacheDeps } from "types";
+
+const mockConnect = jest.fn().mockResolvedValue(undefined);
+const mockFlushAll = jest.fn().mockResolvedValue("OK");
+const mockDisconnect = jest.fn().mockResolvedValue(undefined);
+let mockIsOpen = true;
+
+jest.mock("@redis/client", () => ({
+ createClient: jest.fn(() => ({
+ connect: mockConnect,
+ flushAll: mockFlushAll,
+ disconnect: mockDisconnect,
+ get isOpen() {
+ return mockIsOpen;
+ },
+ })),
+}));
+
+jest.mock("@smithy/signature-v4", () => ({
+ SignatureV4: jest.fn(() => ({
+ presign: jest.fn().mockResolvedValue({
+ query: {
+ "X-Amz-Algorithm": "AWS4-HMAC-SHA256",
+ "X-Amz-Credential": "test-credential",
+ },
+ }),
+ })),
+}));
+
+jest.mock("@aws-crypto/sha256-js", () => ({
+ Sha256: jest.fn(),
+}));
+
+jest.mock("@aws-sdk/credential-providers", () => ({
+ fromNodeProviderChain: jest.fn(() => ({})),
+}));
+
+const deps: ElastiCacheDeps = {
+ endpoint: "test-cache.example.invalid",
+ cacheName: "test-cache",
+ iamUsername: "test-user",
+ region: "eu-west-2",
+};
+
+beforeEach(() => {
+ jest.clearAllMocks();
+ mockIsOpen = true;
+});
+
+describe("flushElastiCache", () => {
+ it("connects, flushes all keys, and disconnects", async () => {
+ await flushElastiCache(deps);
+
+ expect(mockConnect).toHaveBeenCalledTimes(1);
+ expect(mockFlushAll).toHaveBeenCalledTimes(1);
+ expect(mockDisconnect).toHaveBeenCalledTimes(1);
+ });
+
+ it("disconnects even when flushAll throws", async () => {
+ mockFlushAll.mockRejectedValueOnce(new Error("flush failed"));
+
+ await expect(flushElastiCache(deps)).rejects.toThrow("flush failed");
+ expect(mockDisconnect).toHaveBeenCalledTimes(1);
+ });
+
+ it("skips disconnect when client is not open", async () => {
+ mockIsOpen = false;
+
+ await flushElastiCache(deps);
+
+ expect(mockDisconnect).not.toHaveBeenCalled();
+ });
+});
diff --git a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts
index 1d1a501a..3c33bfd6 100644
--- a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts
+++ b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts
@@ -32,6 +32,7 @@ const mockResult: PerformanceResult = {
phases: [],
metrics: [],
deliveryMetrics: [],
+ circuitBreakerMetrics: [],
};
beforeEach(() => {
@@ -42,6 +43,11 @@ beforeEach(() => {
"/aws/lambda/nhs-dev-callbacks-client-transform-filter";
process.env.DELIVERY_LOG_GROUP_PREFIX =
"/aws/lambda/nhs-dev-callbacks-https-client-";
+ process.env.MOCK_WEBHOOK_LOG_GROUP =
+ "/aws/lambda/nhs-dev-callbacks-mock-webhook";
+ process.env.ELASTICACHE_ENDPOINT = "cache.example.invalid";
+ process.env.ELASTICACHE_CACHE_NAME = "test-cache";
+ process.env.ELASTICACHE_IAM_USERNAME = "test-user";
process.env.AWS_REGION = "eu-west-2";
});
@@ -55,9 +61,17 @@ describe("handler", () => {
queueUrl: "https://sqs.example.invalid/queue",
logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter",
deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-https-client-",
+ mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook",
}),
DEFAULT_SCENARIO,
"test-id",
+ undefined,
+ expect.objectContaining({
+ endpoint: "cache.example.invalid",
+ cacheName: "test-cache",
+ iamUsername: "test-user",
+ region: "eu-west-2",
+ }),
);
});
@@ -73,6 +87,8 @@ describe("handler", () => {
expect.anything(),
customScenario,
"custom-test",
+ undefined,
+ expect.anything(),
);
});
@@ -117,6 +133,38 @@ describe("handler", () => {
}),
DEFAULT_SCENARIO,
"no-prefix-test",
+ undefined,
+ expect.anything(),
+ );
+ });
+
+ it("passes undefined elastiCacheDeps when ElastiCache env vars are missing", async () => {
+ delete process.env.ELASTICACHE_ENDPOINT;
+ delete process.env.ELASTICACHE_CACHE_NAME;
+ delete process.env.ELASTICACHE_IAM_USERNAME;
+
+ await handler({ testId: "no-cache-test" });
+
+ expect(mockRunPerformanceTest).toHaveBeenCalledWith(
+ expect.anything(),
+ DEFAULT_SCENARIO,
+ "no-cache-test",
+ undefined,
+ undefined,
+ );
+ });
+
+ it("passes mockWebhookLogGroup from env var", async () => {
+ await handler({ testId: "webhook-test" });
+
+ expect(mockRunPerformanceTest).toHaveBeenCalledWith(
+ expect.objectContaining({
+ mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook",
+ }),
+ expect.anything(),
+ "webhook-test",
+ undefined,
+ expect.anything(),
);
});
});
diff --git a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts
new file mode 100644
index 00000000..60347ef9
--- /dev/null
+++ b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts
@@ -0,0 +1,116 @@
+import type { SQSClient } from "@aws-sdk/client-sqs";
+import { deriveQueueUrls, purgeQueues } from "purge";
+import type { Scenario } from "types";
+
+const scenario: Scenario = {
+ phases: [{ durationSecs: 1, targetEps: 10 }],
+ eventMix: [
+ {
+ weight: 1,
+ factory: "messageStatus",
+ clientId: "perf-client-1",
+ messageStatus: "DELIVERED",
+ },
+ {
+ weight: 1,
+ factory: "channelStatus",
+ clientId: "perf-client-2",
+ channelStatus: "DELIVERED",
+ },
+ ],
+ metricsIntervalSecs: 5,
+};
+
+const inboundQueueUrl =
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue";
+
+describe("deriveQueueUrls", () => {
+ it("derives all queue URLs from the inbound queue URL and scenario", () => {
+ const urls = deriveQueueUrls(inboundQueueUrl, scenario);
+
+ expect(urls).toEqual([
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-dlq-queue",
+ ]);
+ });
+
+ it("deduplicates client IDs that appear multiple times in eventMix", () => {
+ const duplicateScenario: Scenario = {
+ ...scenario,
+ eventMix: [
+ {
+ weight: 1,
+ factory: "messageStatus",
+ clientId: "perf-client-1",
+ messageStatus: "DELIVERED",
+ },
+ {
+ weight: 1,
+ factory: "channelStatus",
+ clientId: "perf-client-1",
+ channelStatus: "DELIVERED",
+ },
+ ],
+ };
+
+ const urls = deriveQueueUrls(inboundQueueUrl, duplicateScenario);
+
+ expect(urls).toEqual([
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue",
+ "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue",
+ ]);
+ });
+});
+
+describe("purgeQueues", () => {
+ const mockSend = jest.fn().mockResolvedValue({});
+ const mockSqsClient = { send: mockSend } as unknown as SQSClient;
+
+ beforeEach(() => {
+ jest.clearAllMocks();
+ mockSend.mockResolvedValue({});
+ });
+
+ it("sends a PurgeQueueCommand for each queue URL", async () => {
+ const urls = [
+ "https://sqs.example.invalid/queue-a",
+ "https://sqs.example.invalid/queue-b",
+ ];
+
+ await purgeQueues(mockSqsClient, urls);
+
+ expect(mockSend).toHaveBeenCalledTimes(2);
+ });
+
+ it("ignores NonExistentQueue errors gracefully", async () => {
+ const nonExistentError = Object.assign(new Error("Queue does not exist"), {
+ name: "AWS.SimpleQueueService.NonExistentQueue",
+ });
+ mockSend.mockRejectedValueOnce(nonExistentError);
+
+ await expect(
+ purgeQueues(mockSqsClient, ["https://sqs.example.invalid/missing"]),
+ ).resolves.toBeUndefined();
+ });
+
+ it("rethrows non-NonExistentQueue errors", async () => {
+ const otherError = new Error("Access denied");
+ mockSend.mockRejectedValueOnce(otherError);
+
+ await expect(
+ purgeQueues(mockSqsClient, ["https://sqs.example.invalid/queue"]),
+ ).rejects.toThrow("Access denied");
+ });
+
+ it("handles an empty queue URL list without sending any commands", async () => {
+ await purgeQueues(mockSqsClient, []);
+
+ expect(mockSend).not.toHaveBeenCalled();
+ });
+});
diff --git a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts
index 1cf5f3a3..46a0928d 100644
--- a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts
+++ b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts
@@ -1,6 +1,7 @@
import type { SQSClient } from "@aws-sdk/client-sqs";
import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs";
import type {
+ CircuitBreakerSnapshot,
DeliveryMetricsSnapshot,
MetricsSnapshot,
PhaseResult,
@@ -10,16 +11,35 @@ import type {
import { defaultSleep, runPerformanceTest } from "runner";
import { generatePhaseLoad } from "sqs";
-import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch";
+import { deriveQueueUrls, purgeQueues } from "purge";
+import { flushElastiCache } from "elasticache";
+import { verifyMockWebhook } from "webhook-verify";
+import {
+ queryCircuitBreakerSnapshot,
+ queryDeliveryMetricsSnapshot,
+ queryMetricsSnapshot,
+ queryPerClientRateTimeline,
+} from "cloudwatch";
jest.mock("sqs");
jest.mock("cloudwatch");
+jest.mock("purge");
+jest.mock("elasticache");
+jest.mock("webhook-verify");
const mockGeneratePhaseLoad = jest.mocked(generatePhaseLoad);
const mockQueryMetricsSnapshot = jest.mocked(queryMetricsSnapshot);
const mockQueryDeliveryMetricsSnapshot = jest.mocked(
queryDeliveryMetricsSnapshot,
);
+const mockQueryCircuitBreakerSnapshot = jest.mocked(
+ queryCircuitBreakerSnapshot,
+);
+const mockQueryPerClientRateTimeline = jest.mocked(queryPerClientRateTimeline);
+const mockDeriveQueueUrls = jest.mocked(deriveQueueUrls);
+const mockPurgeQueues = jest.mocked(purgeQueues);
+const mockFlushElastiCache = jest.mocked(flushElastiCache);
+const mockVerifyMockWebhook = jest.mocked(verifyMockWebhook);
const immediateSleep = jest.fn().mockResolvedValue(undefined);
@@ -46,6 +66,20 @@ const mockDeliverySnapshot: DeliveryMetricsSnapshot = {
p99Ms: 500,
};
+const mockCbSnapshot: CircuitBreakerSnapshot = {
+ snapshotAt: Date.now(),
+ intervalStartSec: 0,
+ intervalEndSec: 60,
+ circuitOpenEvents: 1,
+ circuitCloseEvents: 0,
+ admissionDeniedCircuitOpen: 5,
+ admissionDeniedRateLimited: 3,
+ deliveryAttempts: 100,
+ deliverySuccesses: 92,
+ deliveryFailures: 5,
+ deliveryRateLimited: 3,
+};
+
const scenario: Scenario = {
phases: [{ durationSecs: 1, targetEps: 1000 }],
eventMix: [
@@ -71,6 +105,17 @@ beforeEach(() => {
jest.clearAllMocks();
mockGeneratePhaseLoad.mockResolvedValue(mockPhaseResult);
mockQueryDeliveryMetricsSnapshot.mockResolvedValue(null);
+ mockQueryCircuitBreakerSnapshot.mockResolvedValue(null);
+ mockQueryPerClientRateTimeline.mockResolvedValue([]);
+ mockDeriveQueueUrls.mockReturnValue([
+ "https://sqs.example.invalid/inbound-event-queue",
+ ]);
+ mockPurgeQueues.mockResolvedValue(undefined);
+ mockFlushElastiCache.mockResolvedValue(undefined);
+ mockVerifyMockWebhook.mockResolvedValue({
+ receivedCallbacks: 0,
+ verified: false,
+ });
immediateSleep.mockResolvedValue(undefined);
});
@@ -78,6 +123,7 @@ describe("runPerformanceTest", () => {
it("returns a PerformanceResult with phase results and snapshots from polling and final query", async () => {
mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot);
mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot);
+ mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot);
const result = await runPerformanceTest(
deps,
@@ -92,6 +138,7 @@ describe("runPerformanceTest", () => {
expect(result.phases[0]).toEqual(mockPhaseResult);
expect(result.metrics).toHaveLength(2); // one mid-test, one final
expect(result.deliveryMetrics).toHaveLength(2); // one mid-test, one final
+ expect(result.circuitBreakerMetrics).toHaveLength(2); // one mid-test, one final
expect(result.startedAt).toBeTruthy();
expect(result.completedAt).toBeTruthy();
});
@@ -111,6 +158,7 @@ describe("runPerformanceTest", () => {
expect(result.metrics).toHaveLength(1);
expect(result.metrics[0]).toEqual(mockSnapshot);
expect(result.deliveryMetrics).toHaveLength(0);
+ expect(result.circuitBreakerMetrics).toHaveLength(0);
});
it("produces an empty metrics array when all queries return null", async () => {
@@ -125,6 +173,7 @@ describe("runPerformanceTest", () => {
expect(result.metrics).toHaveLength(0);
expect(result.deliveryMetrics).toHaveLength(0);
+ expect(result.circuitBreakerMetrics).toHaveLength(0);
});
it("runs all phases and collects each result", async () => {
@@ -267,7 +316,9 @@ describe("runPerformanceTest", () => {
);
expect(mockQueryDeliveryMetricsSnapshot).not.toHaveBeenCalled();
+ expect(mockQueryCircuitBreakerSnapshot).not.toHaveBeenCalled();
expect(result.deliveryMetrics).toHaveLength(0);
+ expect(result.circuitBreakerMetrics).toHaveLength(0);
});
it("builds delivery log group names from prefix and event mix client IDs", async () => {
@@ -309,6 +360,273 @@ describe("runPerformanceTest", () => {
expect.any(Number),
);
});
+
+ it("collects circuit breaker metrics when deliveryLogGroupPrefix is set", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot);
+ mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot);
+ mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot);
+
+ const result = await runPerformanceTest(
+ deps,
+ scenario,
+ "test-cb-1",
+ immediateSleep,
+ );
+
+ expect(result.circuitBreakerMetrics.length).toBeGreaterThanOrEqual(1);
+ expect(mockQueryCircuitBreakerSnapshot).toHaveBeenCalled();
+ });
+
+ it("returns empty circuitBreakerMetrics when CB queries return null", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot);
+ mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot);
+ mockQueryCircuitBreakerSnapshot.mockResolvedValue(null);
+
+ const result = await runPerformanceTest(
+ deps,
+ scenario,
+ "test-cb-null",
+ immediateSleep,
+ );
+
+ expect(result.circuitBreakerMetrics).toHaveLength(0);
+ });
+
+ it("uses per-interval windowing for circuit breaker snapshots", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot);
+ mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot);
+ mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot);
+
+ let resolvePhase!: (value: PhaseResult) => void;
+ mockGeneratePhaseLoad.mockImplementation(
+ () =>
+ new Promise((r) => {
+ resolvePhase = r;
+ }),
+ );
+
+ let sleepCount = 0;
+ const controlledSleep = jest.fn(async () => {
+ sleepCount += 1;
+ if (sleepCount >= 3) {
+ resolvePhase(mockPhaseResult);
+ }
+ });
+
+ await runPerformanceTest(
+ deps,
+ scenario,
+ "test-cb-interval",
+ controlledSleep,
+ );
+
+ const cbCalls = mockQueryCircuitBreakerSnapshot.mock.calls;
+ expect(cbCalls.length).toBeGreaterThanOrEqual(2);
+ const firstCallEndSec = cbCalls[0][3];
+ const secondCallStartSec = cbCalls[1][2];
+ expect(secondCallStartSec).toBe(firstCallEndSec);
+ });
+
+ it("collects per-client rate timelines when deliveryLogGroupPrefix is set", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot);
+ mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot);
+ mockQueryPerClientRateTimeline.mockResolvedValue([
+ { timestampSec: 1000, deliveryAttempts: 10 },
+ ]);
+
+ const result = await runPerformanceTest(
+ deps,
+ scenario,
+ "test-pcr-1",
+ immediateSleep,
+ );
+
+ expect(result.perClientRateTimelines).toHaveLength(1);
+ expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1");
+ expect(result.perClientRateTimelines![0].entries).toHaveLength(1);
+ });
+
+ it("queries each client log group individually for rate timelines", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(null);
+ mockQueryPerClientRateTimeline.mockResolvedValue([
+ { timestampSec: 1000, deliveryAttempts: 5 },
+ ]);
+
+ const multiClientScenario: Scenario = {
+ ...scenario,
+ eventMix: [
+ {
+ weight: 1,
+ factory: "messageStatus",
+ clientId: "perf-client-1",
+ messageStatus: "DELIVERED",
+ },
+ {
+ weight: 1,
+ factory: "channelStatus",
+ clientId: "perf-client-2",
+ channelStatus: "DELIVERED",
+ },
+ ],
+ };
+
+ const result = await runPerformanceTest(
+ deps,
+ multiClientScenario,
+ "test-pcr-multi",
+ immediateSleep,
+ );
+
+ expect(mockQueryPerClientRateTimeline).toHaveBeenCalledTimes(2);
+ expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith(
+ deps.cloudWatchClient,
+ "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-1",
+ expect.any(Number),
+ expect.any(Number),
+ );
+ expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith(
+ deps.cloudWatchClient,
+ "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-2",
+ expect.any(Number),
+ expect.any(Number),
+ );
+ expect(result.perClientRateTimelines).toHaveLength(2);
+ });
+
+ it("excludes clients with empty rate timelines", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(null);
+ mockQueryPerClientRateTimeline
+ .mockResolvedValueOnce([{ timestampSec: 1000, deliveryAttempts: 5 }])
+ .mockResolvedValueOnce([]);
+
+ const multiClientScenario: Scenario = {
+ ...scenario,
+ eventMix: [
+ {
+ weight: 1,
+ factory: "messageStatus",
+ clientId: "perf-client-1",
+ messageStatus: "DELIVERED",
+ },
+ {
+ weight: 1,
+ factory: "channelStatus",
+ clientId: "perf-client-2",
+ channelStatus: "DELIVERED",
+ },
+ ],
+ };
+
+ const result = await runPerformanceTest(
+ deps,
+ multiClientScenario,
+ "test-pcr-filter",
+ immediateSleep,
+ );
+
+ expect(result.perClientRateTimelines).toHaveLength(1);
+ expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1");
+ });
+
+ it("skips per-client rate timelines when deliveryLogGroupPrefix is undefined", async () => {
+ const depsWithoutPrefix: RunnerDeps = {
+ ...deps,
+ deliveryLogGroupPrefix: undefined,
+ };
+ mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot);
+
+ const result = await runPerformanceTest(
+ depsWithoutPrefix,
+ scenario,
+ "test-pcr-skip",
+ immediateSleep,
+ );
+
+ expect(mockQueryPerClientRateTimeline).not.toHaveBeenCalled();
+ expect(result.perClientRateTimelines).toHaveLength(0);
+ });
+
+ it("purges queues before and after the test run", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(null);
+
+ await runPerformanceTest(deps, scenario, "test-purge", immediateSleep);
+
+ expect(mockDeriveQueueUrls).toHaveBeenCalledWith(deps.queueUrl, scenario);
+ expect(mockPurgeQueues).toHaveBeenCalledTimes(2);
+ });
+
+ it("flushes ElastiCache before and after when deps are provided", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(null);
+ const elastiCacheDeps = {
+ endpoint: "cache.example.invalid",
+ cacheName: "test-cache",
+ iamUsername: "test-user",
+ region: "eu-west-2",
+ };
+
+ await runPerformanceTest(
+ deps,
+ scenario,
+ "test-flush",
+ immediateSleep,
+ elastiCacheDeps,
+ );
+
+ expect(mockFlushElastiCache).toHaveBeenCalledTimes(2);
+ expect(mockFlushElastiCache).toHaveBeenCalledWith(elastiCacheDeps);
+ });
+
+ it("skips ElastiCache flush when deps are not provided", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(null);
+
+ await runPerformanceTest(deps, scenario, "test-no-flush", immediateSleep);
+
+ expect(mockFlushElastiCache).not.toHaveBeenCalled();
+ });
+
+ it("verifies mock webhook when log group is configured", async () => {
+ const depsWithWebhook: RunnerDeps = {
+ ...deps,
+ mockWebhookLogGroup: "/aws/lambda/test-mock-webhook",
+ };
+ mockQueryMetricsSnapshot.mockResolvedValue(null);
+ mockVerifyMockWebhook.mockResolvedValue({
+ receivedCallbacks: 25,
+ verified: true,
+ });
+
+ const result = await runPerformanceTest(
+ depsWithWebhook,
+ scenario,
+ "test-webhook",
+ immediateSleep,
+ );
+
+ expect(mockVerifyMockWebhook).toHaveBeenCalledWith(
+ depsWithWebhook.cloudWatchClient,
+ "/aws/lambda/test-mock-webhook",
+ expect.any(Number),
+ expect.any(Number),
+ );
+ expect(result.webhookVerification).toEqual({
+ receivedCallbacks: 25,
+ verified: true,
+ });
+ });
+
+ it("omits webhook verification when log group is not configured", async () => {
+ mockQueryMetricsSnapshot.mockResolvedValue(null);
+
+ const result = await runPerformanceTest(
+ deps,
+ scenario,
+ "test-no-webhook",
+ immediateSleep,
+ );
+
+ expect(mockVerifyMockWebhook).not.toHaveBeenCalled();
+ expect(result.webhookVerification).toBeUndefined();
+ });
});
describe("defaultSleep", () => {
diff --git a/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts
new file mode 100644
index 00000000..72c49870
--- /dev/null
+++ b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts
@@ -0,0 +1,173 @@
+import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs";
+import { verifyMockWebhook } from "webhook-verify";
+
+const mockSend = jest.fn();
+const mockClient = { send: mockSend } as unknown as CloudWatchLogsClient;
+
+beforeEach(() => {
+ jest.clearAllMocks();
+});
+
+describe("verifyMockWebhook", () => {
+ it("returns verified=true when callbacks are found", async () => {
+ mockSend.mockResolvedValueOnce({ queryId: "q-1" }).mockResolvedValueOnce({
+ status: "Complete",
+ results: [[{ field: "callbackCount", value: "42" }]],
+ });
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 42, verified: true });
+ });
+
+ it("returns verified=false when no callbacks are found", async () => {
+ mockSend.mockResolvedValueOnce({ queryId: "q-2" }).mockResolvedValueOnce({
+ status: "Complete",
+ results: [[{ field: "callbackCount", value: "0" }]],
+ });
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 0, verified: false });
+ });
+
+ it("returns verified=false when query fails", async () => {
+ mockSend
+ .mockResolvedValueOnce({ queryId: "q-3" })
+ .mockResolvedValueOnce({ status: "Failed" });
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 0, verified: false });
+ });
+
+ it("returns verified=false when no queryId is returned", async () => {
+ mockSend.mockResolvedValueOnce({});
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 0, verified: false });
+ });
+
+ it("returns verified=false when results are empty", async () => {
+ mockSend.mockResolvedValueOnce({ queryId: "q-4" }).mockResolvedValueOnce({
+ status: "Complete",
+ results: [],
+ });
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 0, verified: false });
+ });
+
+ it("returns verified=false when results field is undefined", async () => {
+ mockSend.mockResolvedValueOnce({ queryId: "q-4b" }).mockResolvedValueOnce({
+ status: "Complete",
+ results: undefined,
+ });
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 0, verified: false });
+ });
+
+ it("polls until the query completes", async () => {
+ mockSend
+ .mockResolvedValueOnce({ queryId: "q-5" })
+ .mockResolvedValueOnce({ status: "Running" })
+ .mockResolvedValueOnce({
+ status: "Complete",
+ results: [[{ field: "callbackCount", value: "10" }]],
+ });
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 10, verified: true });
+ expect(mockSend).toHaveBeenCalledTimes(3);
+ });
+
+ it("returns verified=false when query is cancelled", async () => {
+ mockSend
+ .mockResolvedValueOnce({ queryId: "q-6" })
+ .mockResolvedValueOnce({ status: "Cancelled" });
+
+ const result = await verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ expect(result).toEqual({ receivedCallbacks: 0, verified: false });
+ });
+
+ it("returns verified=false when polling times out", async () => {
+ jest.useFakeTimers();
+
+ mockSend.mockResolvedValueOnce({ queryId: "q-7" }).mockImplementation(
+ () =>
+ new Promise((resolve) => {
+ setTimeout(() => resolve({ status: "Running" }), 1000);
+ }),
+ );
+
+ const originalDateNow = Date.now;
+ let callCount = 0;
+ jest.spyOn(Date, "now").mockImplementation(() => {
+ callCount += 1;
+ if (callCount <= 1) return originalDateNow.call(Date);
+ return originalDateNow.call(Date) + 60_000;
+ });
+
+ const promise = verifyMockWebhook(
+ mockClient,
+ "/aws/lambda/test-mock-webhook",
+ 1000,
+ 2000,
+ );
+
+ await jest.advanceTimersByTimeAsync(60_000);
+
+ const result = await promise;
+
+ expect(result).toEqual({ receivedCallbacks: 0, verified: false });
+
+ jest.useRealTimers();
+ jest.restoreAllMocks();
+ });
+});
diff --git a/lambdas/perf-runner-lambda/src/cloudwatch.ts b/lambdas/perf-runner-lambda/src/cloudwatch.ts
index 206bec33..598f5f3f 100644
--- a/lambdas/perf-runner-lambda/src/cloudwatch.ts
+++ b/lambdas/perf-runner-lambda/src/cloudwatch.ts
@@ -3,19 +3,22 @@ import {
GetQueryResultsCommand,
StartQueryCommand,
} from "@aws-sdk/client-cloudwatch-logs";
-import type { DeliveryMetricsSnapshot, MetricsSnapshot } from "types";
+import type {
+ CircuitBreakerSnapshot,
+ DeliveryMetricsSnapshot,
+ MetricsSnapshot,
+ PerClientRateEntry,
+} from "types";
const INSIGHTS_POLL_INTERVAL_MS = 2000;
const INSIGHTS_TIMEOUT_MS = 30_000;
type ResultField = { field?: string; value?: string };
-async function pollQueryResults(
+async function pollInsightsQuery(
client: CloudWatchLogsClient,
queryId: string,
- mapRow: (row: ResultField[]) => T,
-): Promise {
- const zeroResult = mapRow([]);
+): Promise {
const deadline = Date.now() + INSIGHTS_TIMEOUT_MS;
while (Date.now() < deadline) {
@@ -30,15 +33,33 @@ async function pollQueryResults(
}
if (response.status === "Complete") {
- const row = response.results?.[0];
- if (!row) return zeroResult;
- return mapRow(row);
+ return (response.results as ResultField[][]) ?? [];
}
}
return null;
}
+async function pollQueryResults(
+ client: CloudWatchLogsClient,
+ queryId: string,
+ mapRow: (row: ResultField[]) => T,
+): Promise {
+ const rows = await pollInsightsQuery(client, queryId);
+ if (rows === null) return null;
+ return mapRow(rows[0] ?? []);
+}
+
+async function pollAllQueryResults(
+ client: CloudWatchLogsClient,
+ queryId: string,
+ mapRow: (row: ResultField[]) => T,
+): Promise {
+ const rows = await pollInsightsQuery(client, queryId);
+ if (rows === null) return [];
+ return rows.map((row) => mapRow(row));
+}
+
export async function queryMetricsSnapshot(
client: CloudWatchLogsClient,
logGroupName: string,
@@ -108,3 +129,86 @@ export async function queryDeliveryMetricsSnapshot(
};
});
}
+
+export async function queryCircuitBreakerSnapshot(
+ client: CloudWatchLogsClient,
+ logGroupNames: string[],
+ startTimeSec: number,
+ endTimeSec: number,
+): Promise {
+ if (logGroupNames.length === 0) return null;
+
+ const { queryId } = await client.send(
+ new StartQueryCommand({
+ logGroupNames,
+ startTime: startTimeSec,
+ endTime: endTimeSec,
+ queryString: [
+ 'filter msg in ["Circuit breaker opened", "Circuit breaker closed", "Admission denied", "Attempting delivery", "Delivery succeeded", "Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ", "Rate limited (429)"]',
+ '| stats sum(msg = "Circuit breaker opened") as circuitOpenEvents,',
+ ' sum(msg = "Circuit breaker closed") as circuitCloseEvents,',
+ ' sum(msg = "Admission denied" and reason = "circuit_open") as admissionDeniedCircuitOpen,',
+ ' sum(msg = "Admission denied" and reason = "rate_limited") as admissionDeniedRateLimited,',
+ ' sum(msg = "Attempting delivery") as deliveryAttempts,',
+ ' sum(msg = "Delivery succeeded") as deliverySuccesses,',
+ ' sum(msg in ["Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ"]) as deliveryFailures,',
+ ' sum(msg = "Rate limited (429)") as deliveryRateLimited',
+ ].join("\n"),
+ }),
+ );
+
+ if (!queryId) return null;
+
+ return pollQueryResults(client, queryId, (row) => {
+ const getField = (name: string): number =>
+ Number(row.find((f) => f.field === name)?.value ?? 0);
+
+ return {
+ snapshotAt: Date.now(),
+ intervalStartSec: startTimeSec,
+ intervalEndSec: endTimeSec,
+ circuitOpenEvents: getField("circuitOpenEvents"),
+ circuitCloseEvents: getField("circuitCloseEvents"),
+ admissionDeniedCircuitOpen: getField("admissionDeniedCircuitOpen"),
+ admissionDeniedRateLimited: getField("admissionDeniedRateLimited"),
+ deliveryAttempts: getField("deliveryAttempts"),
+ deliverySuccesses: getField("deliverySuccesses"),
+ deliveryFailures: getField("deliveryFailures"),
+ deliveryRateLimited: getField("deliveryRateLimited"),
+ };
+ });
+}
+
+const RATE_TIMELINE_BIN_SECONDS = 10;
+
+export async function queryPerClientRateTimeline(
+ client: CloudWatchLogsClient,
+ logGroupName: string,
+ startTimeSec: number,
+ endTimeSec: number,
+): Promise {
+ const { queryId } = await client.send(
+ new StartQueryCommand({
+ logGroupName,
+ startTime: startTimeSec,
+ endTime: endTimeSec,
+ queryString: [
+ 'filter msg in ["Attempting delivery", "Admission denied"]',
+ `| stats sum(msg = "Attempting delivery") as deliveryAttempts by bin(@timestamp, ${RATE_TIMELINE_BIN_SECONDS}s) as timeBin`,
+ "| sort timeBin asc",
+ ].join("\n"),
+ }),
+ );
+
+ if (!queryId) return [];
+
+ return pollAllQueryResults(client, queryId, (row) => {
+    const timeBinStr = row.find((f) => f.field === "timeBin")?.value;
+    const timestampSec = timeBinStr ? Math.floor(new Date(timeBinStr).getTime() / 1000) : 0;
+ const deliveryAttempts = Number(
+ row.find((f) => f.field === "deliveryAttempts")?.value ?? 0,
+ );
+
+ return { timestampSec, deliveryAttempts };
+ });
+}
diff --git a/lambdas/perf-runner-lambda/src/elasticache.ts b/lambdas/perf-runner-lambda/src/elasticache.ts
new file mode 100644
index 00000000..8d0b86c6
--- /dev/null
+++ b/lambdas/perf-runner-lambda/src/elasticache.ts
@@ -0,0 +1,52 @@
+import { type RedisClientType, createClient } from "@redis/client";
+import { SignatureV4 } from "@smithy/signature-v4";
+import { Sha256 } from "@aws-crypto/sha256-js";
+import { fromNodeProviderChain } from "@aws-sdk/credential-providers";
+import type { ElastiCacheDeps } from "types";
+
+const TOKEN_EXPIRY_SECONDS = 900;
+
+async function generateIamToken(deps: ElastiCacheDeps): Promise {
+ const signer = new SignatureV4({
+ credentials: fromNodeProviderChain(),
+ region: deps.region,
+ service: "elasticache",
+ sha256: Sha256,
+ });
+
+ const signed = await signer.presign(
+ {
+ protocol: "https:",
+ method: "GET",
+ hostname: deps.cacheName,
+ path: "/",
+ query: { Action: "connect", User: deps.iamUsername },
+ headers: { host: deps.cacheName },
+ },
+ { expiresIn: TOKEN_EXPIRY_SECONDS },
+ );
+
+ const qs = new URLSearchParams(
+ signed.query as Record,
+ ).toString();
+ return `${deps.cacheName}/?${qs}`;
+}
+
+export async function flushElastiCache(deps: ElastiCacheDeps): Promise {
+ const token = await generateIamToken(deps);
+
+ const client: RedisClientType = createClient({
+ url: `rediss://${deps.endpoint}:6379`,
+ username: deps.iamUsername,
+ password: token,
+ });
+
+ try {
+ await client.connect();
+ await client.flushAll();
+ } finally {
+ if (client.isOpen) {
+ await client.disconnect();
+ }
+ }
+}
diff --git a/lambdas/perf-runner-lambda/src/index.ts b/lambdas/perf-runner-lambda/src/index.ts
index a0881866..5974627b 100644
--- a/lambdas/perf-runner-lambda/src/index.ts
+++ b/lambdas/perf-runner-lambda/src/index.ts
@@ -3,7 +3,11 @@ import { SQSClient } from "@aws-sdk/client-sqs";
import { Logger } from "@nhs-notify-client-callbacks/logger";
import { runPerformanceTest } from "runner";
import { DEFAULT_SCENARIO } from "scenario";
-import type { PerfRunnerPayload, PerformanceResult } from "types";
+import type {
+ ElastiCacheDeps,
+ PerfRunnerPayload,
+ PerformanceResult,
+} from "types";
const logger = new Logger();
@@ -16,6 +20,10 @@ export async function handler(
const queueUrl = process.env.INBOUND_QUEUE_URL;
const logGroupName = process.env.TRANSFORM_FILTER_LOG_GROUP;
const deliveryLogGroupPrefix = process.env.DELIVERY_LOG_GROUP_PREFIX;
+ const mockWebhookLogGroup = process.env.MOCK_WEBHOOK_LOG_GROUP;
+ const elasticacheEndpoint = process.env.ELASTICACHE_ENDPOINT;
+ const elasticacheCacheName = process.env.ELASTICACHE_CACHE_NAME;
+ const elasticacheIamUsername = process.env.ELASTICACHE_IAM_USERNAME;
if (!queueUrl) {
throw new Error("Missing required environment variable: INBOUND_QUEUE_URL");
@@ -30,6 +38,16 @@ export async function handler(
const sqsClient = new SQSClient({ region });
const cloudWatchClient = new CloudWatchLogsClient({ region });
+ const elastiCacheDeps: ElastiCacheDeps | undefined =
+ elasticacheEndpoint && elasticacheCacheName && elasticacheIamUsername
+ ? {
+ endpoint: elasticacheEndpoint,
+ cacheName: elasticacheCacheName,
+ iamUsername: elasticacheIamUsername,
+ region,
+ }
+ : undefined;
+
logger.info("Performance test started", { testId });
try {
@@ -40,9 +58,12 @@ export async function handler(
queueUrl,
logGroupName,
deliveryLogGroupPrefix,
+ mockWebhookLogGroup,
},
scenario,
testId,
+ undefined,
+ elastiCacheDeps,
);
logger.info("Performance test completed", { testId });
diff --git a/lambdas/perf-runner-lambda/src/purge.ts b/lambdas/perf-runner-lambda/src/purge.ts
new file mode 100644
index 00000000..5743e9d2
--- /dev/null
+++ b/lambdas/perf-runner-lambda/src/purge.ts
@@ -0,0 +1,40 @@
+import { PurgeQueueCommand, type SQSClient } from "@aws-sdk/client-sqs";
+import type { Scenario } from "types";
+
+export function deriveQueueUrls(
+ inboundQueueUrl: string,
+ scenario: Scenario,
+): string[] {
+ // eslint-disable-next-line sonarjs/null-dereference -- String.replace always returns a string
+ const baseUrl = inboundQueueUrl.replace(/inbound-event-queue$/, "");
+ const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))];
+
+ return [
+ inboundQueueUrl,
+ `${baseUrl}inbound-event-dlq-queue`,
+ ...clientIds.flatMap((id) => [
+ `${baseUrl}${id}-delivery-queue`,
+ `${baseUrl}${id}-delivery-dlq-queue`,
+ ]),
+ ];
+}
+
+export async function purgeQueues(
+ client: SQSClient,
+ queueUrls: string[],
+): Promise {
+ const results = await Promise.allSettled(
+ queueUrls.map((url) =>
+ client.send(new PurgeQueueCommand({ QueueUrl: url })),
+ ),
+ );
+
+ for (const result of results) {
+ if (result.status === "rejected") {
+ const error = result.reason as { name?: string };
+      if (error.name !== "AWS.SimpleQueueService.NonExistentQueue" && error.name !== "QueueDoesNotExist") {
+ throw result.reason as Error;
+ }
+ }
+ }
+}
diff --git a/lambdas/perf-runner-lambda/src/runner.ts b/lambdas/perf-runner-lambda/src/runner.ts
index a265e90e..321abc45 100644
--- a/lambdas/perf-runner-lambda/src/runner.ts
+++ b/lambdas/perf-runner-lambda/src/runner.ts
@@ -1,13 +1,25 @@
import type {
+ CircuitBreakerSnapshot,
DeliveryMetricsSnapshot,
+ ElastiCacheDeps,
MetricsSnapshot,
+ PerClientRateTimeline,
PerformanceResult,
PhaseResult,
RunnerDeps,
Scenario,
+ WebhookVerificationResult,
} from "types";
import { generatePhaseLoad } from "sqs";
-import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch";
+import { deriveQueueUrls, purgeQueues } from "purge";
+import { flushElastiCache } from "elasticache";
+import { verifyMockWebhook } from "webhook-verify";
+import {
+ queryCircuitBreakerSnapshot,
+ queryDeliveryMetricsSnapshot,
+ queryMetricsSnapshot,
+ queryPerClientRateTimeline,
+} from "cloudwatch";
const CLOUDWATCH_SETTLING_MS = 60_000;
@@ -25,11 +37,56 @@ function buildDeliveryLogGroupNames(
return [...clientIds].map((id) => `${prefix}${id}`);
}
+async function collectSnapshots(
+ deps: RunnerDeps,
+ deliveryLogGroupNames: string[],
+ startSec: number,
+ endSec: number,
+ cbStartSec: number,
+ out: {
+ snapshots: MetricsSnapshot[];
+ deliverySnapshots: DeliveryMetricsSnapshot[];
+ cbSnapshots: CircuitBreakerSnapshot[];
+ },
+): Promise {
+ const snap = await queryMetricsSnapshot(
+ deps.cloudWatchClient,
+ deps.logGroupName,
+ startSec,
+ endSec,
+ );
+ if (snap !== null) out.snapshots.push(snap);
+
+ if (deliveryLogGroupNames.length > 0) {
+ const deliverySnap = await queryDeliveryMetricsSnapshot(
+ deps.cloudWatchClient,
+ deliveryLogGroupNames,
+ startSec,
+ endSec,
+ );
+ if (deliverySnap !== null) out.deliverySnapshots.push(deliverySnap);
+
+ const cbSnap = await queryCircuitBreakerSnapshot(
+ deps.cloudWatchClient,
+ deliveryLogGroupNames,
+ cbStartSec,
+ endSec,
+ );
+ if (cbSnap !== null) {
+ out.cbSnapshots.push(cbSnap);
+ return endSec;
+ }
+ }
+
+ return cbStartSec;
+}
+
export async function runPerformanceTest(
deps: RunnerDeps,
scenario: Scenario,
testId: string,
  sleepFn: (ms: number) => Promise<void> = defaultSleep,
+ elastiCacheDeps?: ElastiCacheDeps,
): Promise {
if (scenario.eventMix.length === 0) {
throw new Error("scenario.eventMix must contain at least one entry");
@@ -49,10 +106,19 @@ export async function runPerformanceTest(
}
const testStartMs = Date.now();
+
+ const queueUrls = deriveQueueUrls(deps.queueUrl, scenario);
+ await purgeQueues(deps.sqsClient, queueUrls);
+ if (elastiCacheDeps) {
+ await flushElastiCache(elastiCacheDeps);
+ }
+
const startedAt = new Date(testStartMs).toISOString();
const phaseResults: PhaseResult[] = [];
const snapshots: MetricsSnapshot[] = [];
const deliverySnapshots: DeliveryMetricsSnapshot[] = [];
+ const cbSnapshots: CircuitBreakerSnapshot[] = [];
+ let lastCbSnapshotSec = Math.floor(testStartMs / 1000);
let stopPolling = false;
const deliveryLogGroupNames = buildDeliveryLogGroupNames(
@@ -60,29 +126,22 @@ export async function runPerformanceTest(
scenario,
);
+ const out = { snapshots, deliverySnapshots, cbSnapshots };
+
  const pollLoop = async (): Promise<void> => {
await sleepFn(scenario.metricsIntervalSecs * 1000);
while (!stopPolling) {
const startSec = Math.floor(testStartMs / 1000);
const endSec = Math.floor(Date.now() / 1000);
- const snap = await queryMetricsSnapshot(
- deps.cloudWatchClient,
- deps.logGroupName,
+ lastCbSnapshotSec = await collectSnapshots(
+ deps,
+ deliveryLogGroupNames,
startSec,
endSec,
+ lastCbSnapshotSec,
+ out,
);
- if (snap !== null) snapshots.push(snap);
-
- if (deliveryLogGroupNames.length > 0) {
- const deliverySnap = await queryDeliveryMetricsSnapshot(
- deps.cloudWatchClient,
- deliveryLogGroupNames,
- startSec,
- endSec,
- );
- if (deliverySnap !== null) deliverySnapshots.push(deliverySnap);
- }
if (!stopPolling) {
await sleepFn(scenario.metricsIntervalSecs * 1000);
@@ -110,22 +169,48 @@ export async function runPerformanceTest(
const finalStartSec = Math.floor(testStartMs / 1000);
const finalEndSec = Math.floor(Date.now() / 1000);
- const finalSnap = await queryMetricsSnapshot(
- deps.cloudWatchClient,
- deps.logGroupName,
+ await collectSnapshots(
+ deps,
+ deliveryLogGroupNames,
finalStartSec,
finalEndSec,
+ lastCbSnapshotSec,
+ out,
);
- if (finalSnap !== null) snapshots.push(finalSnap);
- if (deliveryLogGroupNames.length > 0) {
- const finalDeliverySnap = await queryDeliveryMetricsSnapshot(
+ const perClientRateTimelines: PerClientRateTimeline[] = [];
+
+ if (deps.deliveryLogGroupPrefix) {
+ const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))];
+ const timelinePromises = clientIds.map(async (clientId) => {
+ const logGroupName = `${deps.deliveryLogGroupPrefix}${clientId}`;
+ const entries = await queryPerClientRateTimeline(
+ deps.cloudWatchClient,
+ logGroupName,
+ finalStartSec,
+ finalEndSec,
+ );
+ return { clientId, entries };
+ });
+ const timelines = await Promise.all(timelinePromises);
+ perClientRateTimelines.push(
+ ...timelines.filter((t) => t.entries.length > 0),
+ );
+ }
+
+ let webhookVerification: WebhookVerificationResult | undefined;
+ if (deps.mockWebhookLogGroup) {
+ webhookVerification = await verifyMockWebhook(
deps.cloudWatchClient,
- deliveryLogGroupNames,
+ deps.mockWebhookLogGroup,
finalStartSec,
finalEndSec,
);
- if (finalDeliverySnap !== null) deliverySnapshots.push(finalDeliverySnap);
+ }
+
+ await purgeQueues(deps.sqsClient, queueUrls);
+ if (elastiCacheDeps) {
+ await flushElastiCache(elastiCacheDeps);
}
return {
@@ -136,5 +221,8 @@ export async function runPerformanceTest(
phases: phaseResults,
metrics: snapshots,
deliveryMetrics: deliverySnapshots,
+ circuitBreakerMetrics: cbSnapshots,
+ perClientRateTimelines,
+ webhookVerification,
};
}
diff --git a/lambdas/perf-runner-lambda/src/types.ts b/lambdas/perf-runner-lambda/src/types.ts
index 5366602d..24df2a50 100644
--- a/lambdas/perf-runner-lambda/src/types.ts
+++ b/lambdas/perf-runner-lambda/src/types.ts
@@ -55,6 +55,35 @@ export type DeliveryMetricsSnapshot = {
p99Ms: number;
};
+export type CircuitBreakerSnapshot = {
+ snapshotAt: number;
+ intervalStartSec: number;
+ intervalEndSec: number;
+ circuitOpenEvents: number;
+ circuitCloseEvents: number;
+ admissionDeniedCircuitOpen: number;
+ admissionDeniedRateLimited: number;
+ deliveryAttempts: number;
+ deliverySuccesses: number;
+ deliveryFailures: number;
+ deliveryRateLimited: number;
+};
+
+export type PerClientRateEntry = {
+ timestampSec: number;
+ deliveryAttempts: number;
+};
+
+export type PerClientRateTimeline = {
+ clientId: string;
+ entries: PerClientRateEntry[];
+};
+
+export type WebhookVerificationResult = {
+ receivedCallbacks: number;
+ verified: boolean;
+};
+
export type PerformanceResult = {
testId: string;
scenario: Scenario;
@@ -63,6 +92,9 @@ export type PerformanceResult = {
phases: PhaseResult[];
metrics: MetricsSnapshot[];
deliveryMetrics: DeliveryMetricsSnapshot[];
+ circuitBreakerMetrics: CircuitBreakerSnapshot[];
+ perClientRateTimelines?: PerClientRateTimeline[];
+ webhookVerification?: WebhookVerificationResult;
};
export type PerfRunnerPayload = {
@@ -76,4 +108,12 @@ export type RunnerDeps = {
queueUrl: string;
logGroupName: string;
deliveryLogGroupPrefix?: string;
+ mockWebhookLogGroup?: string;
+};
+
+export type ElastiCacheDeps = {
+ endpoint: string;
+ cacheName: string;
+ iamUsername: string;
+ region: string;
};
diff --git a/lambdas/perf-runner-lambda/src/webhook-verify.ts b/lambdas/perf-runner-lambda/src/webhook-verify.ts
new file mode 100644
index 00000000..77c1fa6d
--- /dev/null
+++ b/lambdas/perf-runner-lambda/src/webhook-verify.ts
@@ -0,0 +1,59 @@
+import {
+ type CloudWatchLogsClient,
+ GetQueryResultsCommand,
+ StartQueryCommand,
+} from "@aws-sdk/client-cloudwatch-logs";
+import type { WebhookVerificationResult } from "types";
+
+const INSIGHTS_POLL_INTERVAL_MS = 2000;
+const INSIGHTS_TIMEOUT_MS = 30_000;
+
+export async function verifyMockWebhook(
+ client: CloudWatchLogsClient,
+ logGroupName: string,
+ startTimeSec: number,
+ endTimeSec: number,
+): Promise<WebhookVerificationResult> {
+ const { queryId } = await client.send(
+ new StartQueryCommand({
+ logGroupName,
+ startTime: startTimeSec,
+ endTime: endTimeSec,
+ queryString: [
+ 'filter msg = "Callback received"',
+ "| stats count(*) as callbackCount",
+ ].join("\n"),
+ }),
+ );
+
+ if (!queryId) {
+ return { receivedCallbacks: 0, verified: false };
+ }
+
+ const deadline = Date.now() + INSIGHTS_TIMEOUT_MS;
+
+ while (Date.now() < deadline) {
+ await new Promise((resolve) => {
+ setTimeout(resolve, INSIGHTS_POLL_INTERVAL_MS);
+ });
+
+ const response = await client.send(new GetQueryResultsCommand({ queryId }));
+
+ if (response.status === "Failed" || response.status === "Cancelled") {
+ return { receivedCallbacks: 0, verified: false };
+ }
+
+ if (response.status === "Complete") {
+ const rows =
+ (response.results as { field?: string; value?: string }[][]) ?? [];
+ const row = rows[0] ?? [];
+ const count = Number(
+ row.find((f) => f.field === "callbackCount")?.value ?? 0,
+ );
+
+ return { receivedCallbacks: count, verified: count > 0 };
+ }
+ }
+
+ return { receivedCallbacks: 0, verified: false };
+}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index c497eafb..f2b2aa3a 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -450,21 +450,30 @@ importers:
lambdas/perf-runner-lambda:
dependencies:
+ '@aws-crypto/sha256-js':
+ specifier: catalog:aws
+ version: 5.2.0
'@aws-sdk/client-cloudwatch-logs':
specifier: catalog:aws
version: 3.1026.0
'@aws-sdk/client-sqs':
specifier: catalog:aws
version: 3.1026.0
+ '@aws-sdk/credential-providers':
+ specifier: catalog:aws
+ version: 3.1026.0
'@nhs-notify-client-callbacks/logger':
specifier: workspace:*
version: link:../../src/logger
'@nhs-notify-client-callbacks/models':
specifier: workspace:*
version: link:../../src/models
- esbuild:
- specifier: catalog:tools
- version: 0.28.0
+ '@redis/client':
+ specifier: catalog:app
+ version: 1.6.1
+ '@smithy/signature-v4':
+ specifier: catalog:aws
+ version: 5.3.13
devDependencies:
'@tsconfig/node22':
specifier: catalog:tools
@@ -478,6 +487,9 @@ importers:
'@types/node':
specifier: catalog:tools
version: 25.6.0
+ esbuild:
+ specifier: catalog:tools
+ version: 0.28.0
eslint:
specifier: catalog:lint
version: 9.39.4(jiti@2.6.1)