diff --git a/.github/workflows/cicd-1-pull-request.yaml b/.github/workflows/cicd-1-pull-request.yaml index aa5a82bf..3697908c 100644 --- a/.github/workflows/cicd-1-pull-request.yaml +++ b/.github/workflows/cicd-1-pull-request.yaml @@ -173,12 +173,36 @@ jobs: --overrideProjectName "nhs" \ --overrideRoleName "nhs-main-acct-client-callbacks-github-deploy" \ --overrides "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" + pr-create-dynamic-environment-clients: + name: Create Dynamic Environment (clients) + needs: [metadata, pr-create-dynamic-environment] + runs-on: ubuntu-latest + if: needs.metadata.outputs.does_pull_request_exist == 'true' && github.ref != 'refs/heads/main' + env: + APP_CLIENT_ID: ${{ secrets.APP_CLIENT_ID }} + APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} + steps: + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Trigger dynamic environment creation (cbc) + shell: bash + run: | + .github/scripts/dispatch_internal_repo_workflow.sh \ + --infraRepoName "$(echo ${{ github.repository }} | cut -d'/' -f2)" \ + --releaseVersion "${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" \ + --targetWorkflow "dispatch-deploy-dynamic-env.yaml" \ + --targetEnvironment "pr${{ needs.metadata.outputs.pr_number }}" \ + --targetComponent "cbc" \ + --targetAccountGroup "nhs-notify-client-callbacks-dev" \ + --terraformAction "apply" \ + --overrideProjectName "nhs" \ + --overrideRoleName "nhs-main-acct-client-callbacks-github-deploy" \ + --overrides "branch_name=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" acceptance-stage: # Recommended maximum execution time is 10 minutes name: "Acceptance stage" - needs: [metadata, build-stage, pr-create-dynamic-environment] + needs: [metadata, build-stage, pr-create-dynamic-environment, pr-create-dynamic-environment-clients] uses: ./.github/workflows/stage-4-acceptance.yaml if: >- - contains(fromJSON('["success", "skipped"]'), needs.pr-create-dynamic-environment.result) && + 
contains(fromJSON('["success", "skipped"]'), needs.pr-create-dynamic-environment.result) && contains(fromJSON('["success", "skipped"]'), needs.pr-create-dynamic-environment-clients.result) && (needs.metadata.outputs.does_pull_request_exist == 'true' || (github.event_name == 'pull_request' && (github.event.action == 'opened' || github.event.action == 'reopened')) || (github.event_name == 'push' && github.ref == 'refs/heads/main')) with: build_datetime: "${{ needs.metadata.outputs.build_datetime }}" diff --git a/.github/workflows/pr_closed.yml b/.github/workflows/pr_closed.yml index 42e61428..a0b2c396 100644 --- a/.github/workflows/pr_closed.yml +++ b/.github/workflows/pr_closed.yml @@ -46,7 +46,7 @@ jobs: strategy: max-parallel: 1 matrix: - component: [callbacks] + component: [callbacks, cbc] steps: - name: Checkout repository diff --git a/.github/workflows/pr_destroy_dynamic_env.yml b/.github/workflows/pr_destroy_dynamic_env.yml index 67abd292..350d5316 100644 --- a/.github/workflows/pr_destroy_dynamic_env.yml +++ b/.github/workflows/pr_destroy_dynamic_env.yml @@ -19,6 +19,22 @@ jobs: steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + - name: Trigger dynamic environment destroy (cbc) + env: + APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} + APP_CLIENT_ID: ${{ secrets.APP_CLIENT_ID }} + shell: bash + run: | + .github/scripts/dispatch_internal_repo_workflow.sh \ + --infraRepoName "$(echo ${{ github.repository }} | cut -d'/' -f2)" \ + --releaseVersion "${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" \ + --targetWorkflow "dispatch-deploy-dynamic-env.yaml" \ + --targetEnvironment "pr${{ github.event.number }}" \ + --targetComponent "cbc" \ + --targetAccountGroup "nhs-notify-client-callbacks-dev" \ + --terraformAction "destroy" \ + --overrideProjectName "nhs" \ + --overrideRoleName "nhs-main-acct-client-callbacks-github-deploy" - name: Trigger dynamic environment destroy env: APP_PEM_FILE: ${{ secrets.APP_PEM_FILE }} diff --git a/.github/workflows/release_created.yml b/.github/workflows/release_created.yml index 329282ae..d29c9a66 
100644 --- a/.github/workflows/release_created.yml +++ b/.github/workflows/release_created.yml @@ -22,7 +22,7 @@ jobs: strategy: max-parallel: 1 matrix: - component: [callbacks] + component: [callbacks, cbc] steps: - name: Checkout repository diff --git a/infrastructure/terraform/components/callbacks-clients/README.md b/infrastructure/terraform/components/callbacks-clients/README.md new file mode 100644 index 00000000..df8c1f5c --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/README.md @@ -0,0 +1,19 @@ + + + + +## Requirements + +No requirements. +## Inputs + +No inputs. +## Modules + +No modules. +## Outputs + +No outputs. + + + diff --git a/infrastructure/terraform/components/callbacks-clients/locals.tf b/infrastructure/terraform/components/callbacks-clients/locals.tf new file mode 100644 index 00000000..ee33b2ba --- /dev/null +++ b/infrastructure/terraform/components/callbacks-clients/locals.tf @@ -0,0 +1,57 @@ +locals { + aws_lambda_functions_dir_path = "../../../../lambdas" + + clients_dir_path = "${path.module}/../../modules/clients" + + config_clients = merge([ + for filename in fileset(local.clients_dir_path, "*.json") : { + (replace(filename, ".json", "")) = jsondecode(file("${local.clients_dir_path}/${filename}")) + } + ]...) + + # When deploying mock clients, replace sentinel placeholder values with the mock webhook URL and API key. + # Only used for S3 object content — must not be used as a for_each source (contains apply-time values). + enriched_mock_config_clients = var.deploy_mock_clients ? 
{ + for client_id, client in local.config_clients : + client_id => merge(client, { + targets = [ + for target in try(client.targets, []) : + merge(target, { + invocationEndpoint = "https://${local.callbacks.mock_webhook_alb_dns_name}/${target.targetId}" + apiKey = merge(target.apiKey, { headerValue = local.callbacks.mock_webhook_api_key }) + delivery = merge(try(target.delivery, {}), { + mtls = merge(try(target.delivery.mtls, {}), { + certPinning = merge(try(target.delivery.mtls.certPinning, {}), try(target.delivery.mtls.certPinning.enabled, false) ? { + spkiHash = local.callbacks.mock_server_spki_hash + } : {}) + }) + }) + }) + ] + }) + } : local.config_clients + + client_subscriptions = { + for client_id, data in local.config_clients : + client_id => { + for subscription in try(data.subscriptions, []) : + subscription.subscriptionId => { + subscription_id = subscription.subscriptionId + target_ids = try(subscription.targetIds, []) + } + } + } + + client_subscription_targets = { + for client_id, data in local.config_clients : + client_id => merge([ + for subscription in try(data.subscriptions, []) : { + for target_id in try(subscription.targetIds, []) : + "${subscription.subscriptionId}-${target_id}" => { + subscription_id = subscription.subscriptionId + target_id = target_id + } + } + ]...) 
+ } +} diff --git a/infrastructure/terraform/components/callbacks/README.md b/infrastructure/terraform/components/callbacks/README.md index 02804698..c725fe42 100644 --- a/infrastructure/terraform/components/callbacks/README.md +++ b/infrastructure/terraform/components/callbacks/README.md @@ -50,7 +50,6 @@ | Name | Source | Version | |------|--------|---------| | [client\_config\_bucket](#module\_client\_config\_bucket) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-s3bucket.zip | n/a | -| [client\_delivery](#module\_client\_delivery) | ../../modules/client-delivery | n/a | | [client\_transform\_filter\_lambda](#module\_client\_transform\_filter\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-lambda.zip | n/a | | [kms](#module\_kms) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-kms.zip | n/a | | [mock\_webhook\_lambda](#module\_mock\_webhook\_lambda) | https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-lambda.zip | n/a | @@ -61,8 +60,28 @@ | Name | Description | |------|-------------| +| [applications\_map\_parameter\_name](#output\_applications\_map\_parameter\_name) | SSM Parameter Store path for the clientId-to-applicationData map | +| [client\_config\_bucket](#output\_client\_config\_bucket) | S3 bucket name for client subscription configuration | +| [client\_config\_bucket\_arn](#output\_client\_config\_bucket\_arn) | S3 bucket ARN for client subscription configuration | | [deployment](#output\_deployment) | Deployment details used for post-deployment scripts | +| [elasticache\_cache\_name](#output\_elasticache\_cache\_name) | ElastiCache cache name for SigV4 token presigning | +| [elasticache\_endpoint](#output\_elasticache\_endpoint) | ElastiCache Serverless endpoint address | +| [elasticache\_iam\_username](#output\_elasticache\_iam\_username) | IAM username for ElastiCache 
authentication | +| [event\_bus\_name](#output\_event\_bus\_name) | EventBridge bus name for client subscription rules | +| [kms\_key\_arn](#output\_kms\_key\_arn) | KMS key ARN for encryption at rest | +| [lambda\_s3\_bucket](#output\_lambda\_s3\_bucket) | S3 bucket for Lambda function artefacts | +| [lambda\_security\_group\_id](#output\_lambda\_security\_group\_id) | Security group ID for per-client HTTPS Client Lambda functions | +| [log\_destination\_arn](#output\_log\_destination\_arn) | Firehose destination ARN for log forwarding | +| [log\_subscription\_role\_arn](#output\_log\_subscription\_role\_arn) | IAM role ARN for CloudWatch log subscription | +| [mock\_server\_spki\_hash](#output\_mock\_server\_spki\_hash) | Base64 SHA-256 SPKI hash of the mock server certificate | +| [mock\_webhook\_alb\_dns\_name](#output\_mock\_webhook\_alb\_dns\_name) | DNS name of the mock webhook ALB (dev/test only) | +| [mock\_webhook\_api\_key](#output\_mock\_webhook\_api\_key) | API key for the mock webhook endpoint (dev/test only) | | [mock\_webhook\_lambda\_log\_group\_name](#output\_mock\_webhook\_lambda\_log\_group\_name) | CloudWatch log group name for mock webhook lambda (for integration test queries) | +| [mtls\_cert\_secret\_arn](#output\_mtls\_cert\_secret\_arn) | Secrets Manager ARN for the shared mTLS client certificate | +| [mtls\_test\_ca\_s3\_key](#output\_mtls\_test\_ca\_s3\_key) | S3 key for dev CA certificate PEM bundle | +| [mtls\_test\_cert\_s3\_bucket](#output\_mtls\_test\_cert\_s3\_bucket) | S3 bucket for dev mTLS test certificates | +| [mtls\_test\_cert\_s3\_key](#output\_mtls\_test\_cert\_s3\_key) | S3 key for dev mTLS test certificate bundle | +| [vpc\_subnet\_ids](#output\_vpc\_subnet\_ids) | VPC subnet IDs for Lambda execution | diff --git a/infrastructure/terraform/components/callbacks/locals.tf b/infrastructure/terraform/components/callbacks/locals.tf index 68129a5b..ef474000 100644 --- a/infrastructure/terraform/components/callbacks/locals.tf 
+++ b/infrastructure/terraform/components/callbacks/locals.tf @@ -5,65 +5,10 @@ locals { root_domain_name = "${var.environment}.${local.acct.route53_zone_names["client-callbacks"]}" # e.g. [main|dev|abxy0].smsnudge.[dev|nonprod|prod].nhsnotify.national.nhs.uk root_domain_id = local.acct.route53_zone_ids["client-callbacks"] - clients_dir_path = "${path.module}/../../modules/clients" - - config_clients = merge([ - for filename in fileset(local.clients_dir_path, "*.json") : { - (replace(filename, ".json", "")) = jsondecode(file("${local.clients_dir_path}/${filename}")) - } - ]...) - # SPKI hash of the mock webhook server certificate for cert-pinning enrichment. # Computed via external data source because Terraform cannot SHA-256 hash raw binary (DER) data natively. mock_server_spki_hash = var.deploy_mock_clients ? data.external.mock_server_spki_hash[0].result.hash : "" - # When deploying mock clients, replace sentinel placeholder values with the mock webhook URL and API key. # Only used for S3 object content — must not be used as a for_each source (contains apply-time values). - enriched_mock_config_clients = var.deploy_mock_clients ? { - for client_id, client in local.config_clients : - client_id => merge(client, { - targets = [ - for target in try(client.targets, []) : - merge(target, { - invocationEndpoint = "https://${aws_lb.mock_webhook_mtls[0].dns_name}/${target.targetId}" - apiKey = merge(target.apiKey, { headerValue = random_password.mock_webhook_api_key[0].result }) - delivery = merge(try(target.delivery, {}), { - mtls = merge(try(target.delivery.mtls, {}), { - certPinning = merge(try(target.delivery.mtls.certPinning, {}), try(target.delivery.mtls.certPinning.enabled, false) ? 
{ - spkiHash = local.mock_server_spki_hash - } : {}) - }) - }) - }) - ] - }) - } : local.config_clients - - - client_subscriptions = { - for client_id, data in local.config_clients : - client_id => { - for subscription in try(data.subscriptions, []) : - subscription.subscriptionId => { - subscription_id = subscription.subscriptionId - target_ids = try(subscription.targetIds, []) - } - } - } - - client_subscription_targets = { - for client_id, data in local.config_clients : - client_id => merge([ - for subscription in try(data.subscriptions, []) : { - for target_id in try(subscription.targetIds, []) : - "${subscription.subscriptionId}-${target_id}" => { - subscription_id = subscription.subscriptionId - target_id = target_id - } - } - ]...) - } - applications_map_parameter_name = coalesce(var.applications_map_parameter_name, "/${var.project}/${var.environment}/${var.component}/applications-map") client_config_bucket_arn = "arn:aws:s3:::${var.project}-${var.aws_account_id}-${var.region}-${var.environment}-${var.component}-subscription-config" diff --git a/infrastructure/terraform/components/callbacks/module_client_delivery.tf b/infrastructure/terraform/components/callbacks/module_client_delivery.tf deleted file mode 100644 index ebc2e9e1..00000000 --- a/infrastructure/terraform/components/callbacks/module_client_delivery.tf +++ /dev/null @@ -1,46 +0,0 @@ -module "client_delivery" { - source = "../../modules/client-delivery" - for_each = local.config_clients - - project = var.project - aws_account_id = var.aws_account_id - region = var.region - component = var.component - environment = var.environment - group = var.group - - client_id = each.key - client_bus_name = aws_cloudwatch_event_bus.main.name - kms_key_arn = module.kms.key_arn - - subscriptions = local.client_subscriptions[each.key] - subscription_targets = local.client_subscription_targets[each.key] - - client_config_bucket = module.client_config_bucket.bucket - client_config_bucket_arn = 
module.client_config_bucket.arn - - applications_map_parameter_name = local.applications_map_parameter_name - - lambda_s3_bucket = local.acct.s3_buckets["lambda_function_artefacts"]["id"] - lambda_code_base_path = local.aws_lambda_functions_dir_path - - force_lambda_code_deploy = var.force_lambda_code_deploy - log_level = var.log_level - log_retention_in_days = var.log_retention_in_days - enable_xray_tracing = var.enable_xray_tracing - - log_destination_arn = local.log_destination_arn - log_subscription_role_arn = local.acct.log_subscription_role_arn - - elasticache_endpoint = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address - elasticache_cache_name = aws_elasticache_serverless_cache.delivery_state.name - elasticache_iam_username = "${var.project}-${var.environment}-${var.component}-elasticache-user" - - mtls_cert_secret_arn = var.mtls_cert_secret_arn - mtls_test_cert_s3_bucket = var.deploy_mock_clients ? module.mtls_test_certs_bucket[0].bucket : "" - mtls_test_cert_s3_key = local.mtls_test_cert_s3_key # gitleaks:allow - mtls_test_ca_s3_key = local.mtls_test_ca_s3_key # gitleaks:allow - - vpc_subnet_ids = try(local.acct.private_subnets[local.bc_name], []) - lambda_security_group_id = aws_security_group.https_client_lambda.id -} diff --git a/infrastructure/terraform/components/callbacks/module_kms.tf b/infrastructure/terraform/components/callbacks/module_kms.tf index 327b5641..778d0e3d 100644 --- a/infrastructure/terraform/components/callbacks/module_kms.tf +++ b/infrastructure/terraform/components/callbacks/module_kms.tf @@ -66,7 +66,8 @@ data "aws_iam_policy_document" "kms" { values = [ "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-inbound-event-queue", "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-inbound-event-dlq", - "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-callbacks-*-dlq-queue" #wildcard here so that DLQs 
for clients can also use this key + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-dlq-queue", ] } } diff --git a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf index 424294a8..4795bc8d 100644 --- a/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf +++ b/infrastructure/terraform/components/callbacks/module_perf_runner_lambda.tf @@ -39,7 +39,17 @@ module "perf_runner_lambda" { ENVIRONMENT = var.environment INBOUND_QUEUE_URL = module.sqs_inbound_event.sqs_queue_url TRANSFORM_FILTER_LOG_GROUP = module.client_transform_filter_lambda.cloudwatch_log_group_name - DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${local.csi}-https-client-" + DELIVERY_LOG_GROUP_PREFIX = "/aws/lambda/${var.project}-${var.environment}-cbc-https-client-" + DELIVERY_QUEUE_URL_PREFIX = "https://sqs.${var.region}.amazonaws.com/${var.aws_account_id}/${var.project}-${var.environment}-cbc-" + MOCK_WEBHOOK_LOG_GROUP = var.deploy_mock_clients ? 
module.mock_webhook_lambda[0].cloudwatch_log_group_name : "" + ELASTICACHE_ENDPOINT = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address + ELASTICACHE_CACHE_NAME = aws_elasticache_serverless_cache.delivery_state.name + ELASTICACHE_IAM_USERNAME = "${var.project}-${var.environment}-${var.component}-elasticache-user" + } + + vpc_config = { + subnet_ids = try(local.acct.private_subnets[local.bc_name], []) + security_group_ids = [aws_security_group.https_client_lambda.id] } } @@ -74,6 +84,22 @@ data "aws_iam_policy_document" "perf_runner_lambda" { ] } + statement { + sid = "SQSPurgeQueue" + effect = "Allow" + + actions = [ + "sqs:PurgeQueue", + ] + + resources = [ + module.sqs_inbound_event.sqs_queue_arn, + "${module.sqs_inbound_event.sqs_queue_arn}-dlq", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-queue", + "arn:aws:sqs:${var.region}:${var.aws_account_id}:${var.project}-${var.environment}-cbc-*-delivery-dlq-queue", + ] + } + statement { sid = "CloudWatchLogsInsightsQuery" effect = "Allow" @@ -83,10 +109,15 @@ data "aws_iam_policy_document" "perf_runner_lambda" { "logs:StopQuery", ] - resources = [ - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", - "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${local.csi}-https-client-*", - ] + resources = concat( + [ + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.client_transform_filter_lambda.cloudwatch_log_group_name}:*", + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:/aws/lambda/${var.project}-${var.environment}-cbc-https-client-*", + ], + var.deploy_mock_clients ? 
[ + "arn:aws:logs:${var.region}:${var.aws_account_id}:log-group:${module.mock_webhook_lambda[0].cloudwatch_log_group_name}:*", + ] : [], + ) } statement { @@ -99,4 +130,33 @@ data "aws_iam_policy_document" "perf_runner_lambda" { resources = ["*"] } + + statement { + sid = "ElastiCacheConnect" + effect = "Allow" + + actions = [ + "elasticache:Connect", + ] + + resources = [ + aws_elasticache_serverless_cache.delivery_state.arn, + aws_elasticache_user.delivery_state_iam.arn, + ] + } + + statement { + sid = "VPCNetworkInterfacePermissions" + effect = "Allow" + + actions = [ + "ec2:CreateNetworkInterface", + "ec2:DeleteNetworkInterface", + "ec2:DescribeNetworkInterfaces", + ] + + resources = [ + "*", + ] + } } diff --git a/infrastructure/terraform/components/callbacks/outputs.tf b/infrastructure/terraform/components/callbacks/outputs.tf index 1ca00df8..359e77d2 100644 --- a/infrastructure/terraform/components/callbacks/outputs.tf +++ b/infrastructure/terraform/components/callbacks/outputs.tf @@ -22,3 +22,108 @@ output "mock_webhook_lambda_log_group_name" { description = "CloudWatch log group name for mock webhook lambda (for integration test queries)" value = var.deploy_mock_clients ? module.mock_webhook_lambda[0].cloudwatch_log_group_name : null } + +## +# Shared outputs consumed by the cbc component via terraform_remote_state. 
+## + +output "kms_key_arn" { + description = "KMS key ARN for encryption at rest" + value = module.kms.key_arn +} + +output "event_bus_name" { + description = "EventBridge bus name for client subscription rules" + value = aws_cloudwatch_event_bus.main.name +} + +output "client_config_bucket" { + description = "S3 bucket name for client subscription configuration" + value = module.client_config_bucket.bucket +} + +output "client_config_bucket_arn" { + description = "S3 bucket ARN for client subscription configuration" + value = module.client_config_bucket.arn +} + +output "applications_map_parameter_name" { + description = "SSM Parameter Store path for the clientId-to-applicationData map" + value = local.applications_map_parameter_name +} + +output "lambda_s3_bucket" { + description = "S3 bucket for Lambda function artefacts" + value = local.acct.s3_buckets["lambda_function_artefacts"]["id"] +} + +output "log_destination_arn" { + description = "Firehose destination ARN for log forwarding" + value = local.log_destination_arn +} + +output "log_subscription_role_arn" { + description = "IAM role ARN for CloudWatch log subscription" + value = local.acct.log_subscription_role_arn +} + +output "elasticache_endpoint" { + description = "ElastiCache Serverless endpoint address" + value = aws_elasticache_serverless_cache.delivery_state.endpoint[0].address +} + +output "elasticache_cache_name" { + description = "ElastiCache cache name for SigV4 token presigning" + value = aws_elasticache_serverless_cache.delivery_state.name +} + +output "elasticache_iam_username" { + description = "IAM username for ElastiCache authentication" + value = "${var.project}-${var.environment}-${var.component}-elasticache-user" +} + +output "lambda_security_group_id" { + description = "Security group ID for per-client HTTPS Client Lambda functions" + value = aws_security_group.https_client_lambda.id +} + +output "vpc_subnet_ids" { + description = "VPC subnet IDs for Lambda execution" + value = 
try(local.acct.private_subnets[local.bc_name], []) +} + +output "mtls_cert_secret_arn" { + description = "Secrets Manager ARN for the shared mTLS client certificate" + value = var.mtls_cert_secret_arn +} + +output "mtls_test_cert_s3_bucket" { + description = "S3 bucket for dev mTLS test certificates" + value = var.deploy_mock_clients ? module.mtls_test_certs_bucket[0].bucket : "" +} + +output "mtls_test_cert_s3_key" { + description = "S3 key for dev mTLS test certificate bundle" + value = local.mtls_test_cert_s3_key +} + +output "mtls_test_ca_s3_key" { + description = "S3 key for dev CA certificate PEM bundle" + value = local.mtls_test_ca_s3_key +} + +output "mock_webhook_alb_dns_name" { + description = "DNS name of the mock webhook ALB (dev/test only)" + value = var.deploy_mock_clients ? aws_lb.mock_webhook_mtls[0].dns_name : "" +} + +output "mock_webhook_api_key" { + description = "API key for the mock webhook endpoint (dev/test only)" + value = var.deploy_mock_clients ? random_password.mock_webhook_api_key[0].result : "" + sensitive = true +} + +output "mock_server_spki_hash" { + description = "Base64 SHA-256 SPKI hash of the mock server certificate" + value = local.mock_server_spki_hash +} diff --git a/infrastructure/terraform/components/callbacks/pre.sh b/infrastructure/terraform/components/callbacks/pre.sh index cac3b745..64cefdfa 100755 --- a/infrastructure/terraform/components/callbacks/pre.sh +++ b/infrastructure/terraform/components/callbacks/pre.sh @@ -1,41 +1,12 @@ # This script is run before the Terraform apply command. -# It ensures dependencies are installed, generates local client config files -# for terraform from S3-held subscriptions, and builds lambda workspaces. +# It ensures dependencies are installed and builds lambda workspaces. 
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # shellcheck source=_paths.sh source "${script_dir}/_paths.sh" -# Resolve deploy_mock_clients and deploy_perf_runner from tfvars; base_path/group/region/environment are in scope from terraform.sh -deploy_mock_clients="false" -deploy_perf_runner="false" -for _tfvar_file in \ - "${base_path}/etc/group_${group}.tfvars" \ - "${base_path}/etc/env_${region}_${environment}.tfvars"; do - if [ -f "${_tfvar_file}" ]; then - _val=$(grep -E '^\s*deploy_mock_clients\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') - [ -n "${_val}" ] && deploy_mock_clients="${_val}" - _val=$(grep -E '^\s*deploy_perf_runner\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') - [ -n "${_val}" ] && deploy_perf_runner="${_val}" - fi -done -echo "deploy_mock_clients resolved to: ${deploy_mock_clients}" -echo "deploy_perf_runner resolved to: ${deploy_perf_runner}" - pnpm install --frozen-lockfile pnpm run generate-dependencies -"${script_dir}/sync-client-config.sh" - -if [ "${deploy_mock_clients}" == "true" ]; then - cp "${bounded_context_root}/tests/integration/fixtures/subscriptions/"*.json "${clients_dir}/" - echo "Copied mock client subscription config fixtures into clients dir" -fi - -if [ "${deploy_perf_runner}" == "true" ]; then - cp "${bounded_context_root}/tests/performance/fixtures/subscriptions/"*.json "${clients_dir}/" - echo "Copied perf client subscription config fixtures into clients dir" -fi - pnpm run --recursive --if-present lambda-build diff --git a/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf b/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf index 9943affd..f3ab9b1e 100644 --- a/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf +++ b/infrastructure/terraform/components/callbacks/s3_bucket_client_config.tf @@ -1,16 +1,3 @@ -resource "aws_s3_object" "mock_client_config" { - for_each = var.deploy_mock_clients ? 
toset(keys(local.config_clients)) : toset([]) - - bucket = module.client_config_bucket.id - key = "client_subscriptions/${local.config_clients[each.key].clientId}.json" - content = jsonencode(local.enriched_mock_config_clients[each.key]) - - kms_key_id = module.kms.key_arn - server_side_encryption = "aws:kms" - - content_type = "application/json" -} - module "client_config_bucket" { source = "https://github.com/NHSDigital/nhs-notify-shared-modules/releases/download/3.0.7/terraform-s3bucket.zip" diff --git a/infrastructure/terraform/components/cbc/.tool-versions b/infrastructure/terraform/components/cbc/.tool-versions new file mode 100644 index 00000000..3dd74c72 --- /dev/null +++ b/infrastructure/terraform/components/cbc/.tool-versions @@ -0,0 +1 @@ +terraform 1.10.1 diff --git a/infrastructure/terraform/components/cbc/README.md b/infrastructure/terraform/components/cbc/README.md new file mode 100644 index 00000000..8c604b55 --- /dev/null +++ b/infrastructure/terraform/components/cbc/README.md @@ -0,0 +1,43 @@ + + + + +## Requirements + +| Name | Version | +|------|---------| +| [terraform](#requirement\_terraform) | >= 1.10.1 | +| [aws](#requirement\_aws) | 6.13 | +| [random](#requirement\_random) | ~> 3.0 | +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [aws\_account\_id](#input\_aws\_account\_id) | The AWS Account ID (numeric) | `string` | n/a | yes | +| [component](#input\_component) | The variable encapsulating the name of this component | `string` | `"cbc"` | no | +| [default\_tags](#input\_default\_tags) | A map of default tags to apply to all taggable resources within the component | `map(string)` | `{}` | no | +| [deploy\_mock\_clients](#input\_deploy\_mock\_clients) | Flag to deploy mock client subscription config for integration testing (test/dev environments only) | `bool` | `false` | no | +| [deploy\_perf\_runner](#input\_deploy\_perf\_runner) | Flag to deploy performance test 
client subscription fixtures | `bool` | `false` | no | +| [enable\_xray\_tracing](#input\_enable\_xray\_tracing) | Enable AWS X-Ray active tracing for Lambda functions | `bool` | `false` | no | +| [environment](#input\_environment) | The name of the tfscaffold environment | `string` | n/a | yes | +| [force\_lambda\_code\_deploy](#input\_force\_lambda\_code\_deploy) | If the lambda package in s3 has the same commit id tag as the terraform build branch, the lambda will not update automatically. Set to True if making changes to Lambda code from on the same commit for example during development | `bool` | `false` | no | +| [group](#input\_group) | The group variables are being inherited from (often synonmous with account short-name) | `string` | n/a | yes | +| [log\_level](#input\_log\_level) | The log level to be used in lambda functions within the component | `string` | `"INFO"` | no | +| [log\_retention\_in\_days](#input\_log\_retention\_in\_days) | The retention period in days for the Cloudwatch Logs events to be retained, default of 0 is indefinite | `number` | `0` | no | +| [parent\_acct\_environment](#input\_parent\_acct\_environment) | Name of the environment responsible for the acct resources used, affects things like DNS zone. 
Useful for named dev environments | `string` | `"main"` | no | +| [project](#input\_project) | The name of the tfscaffold project | `string` | n/a | yes | +| [region](#input\_region) | The AWS Region | `string` | n/a | yes | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | +## Modules + +| Name | Source | Version | +|------|--------|---------| +| [client\_delivery](#module\_client\_delivery) | ../../modules/client-delivery | n/a | +## Outputs + +| Name | Description | +|------|-------------| +| [deployment](#output\_deployment) | Deployment details used for post-deployment scripts | + + + diff --git a/infrastructure/terraform/components/cbc/_paths.sh b/infrastructure/terraform/components/cbc/_paths.sh new file mode 100644 index 00000000..9b9aba00 --- /dev/null +++ b/infrastructure/terraform/components/cbc/_paths.sh @@ -0,0 +1,8 @@ +_paths_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +repo_root="$(cd "${_paths_dir}/../../../.." && pwd)" +clients_dir="${repo_root}/infrastructure/terraform/modules/clients" + +# Follow symlinks to find the real nhs-notify-client-callbacks root +# (repo_root resolves to the workspace root, which differs in CI where the component is symlinked in) +_real_script="$(readlink -f "${BASH_SOURCE[0]}")" +bounded_context_root="$(cd "$(dirname "${_real_script}")/../../../.." 
&& pwd)" diff --git a/infrastructure/terraform/components/cbc/locals.tf b/infrastructure/terraform/components/cbc/locals.tf new file mode 100644 index 00000000..ee33b2ba --- /dev/null +++ b/infrastructure/terraform/components/cbc/locals.tf @@ -0,0 +1,57 @@ +locals { + aws_lambda_functions_dir_path = "../../../../lambdas" + + clients_dir_path = "${path.module}/../../modules/clients" + + config_clients = merge([ + for filename in fileset(local.clients_dir_path, "*.json") : { + (replace(filename, ".json", "")) = jsondecode(file("${local.clients_dir_path}/${filename}")) + } + ]...) + + # When deploying mock clients, replace sentinel placeholder values with the mock webhook URL and API key. + # Only used for S3 object content — must not be used as a for_each source (contains apply-time values). + enriched_mock_config_clients = var.deploy_mock_clients ? { + for client_id, client in local.config_clients : + client_id => merge(client, { + targets = [ + for target in try(client.targets, []) : + merge(target, { + invocationEndpoint = "https://${local.callbacks.mock_webhook_alb_dns_name}/${target.targetId}" + apiKey = merge(target.apiKey, { headerValue = local.callbacks.mock_webhook_api_key }) + delivery = merge(try(target.delivery, {}), { + mtls = merge(try(target.delivery.mtls, {}), { + certPinning = merge(try(target.delivery.mtls.certPinning, {}), try(target.delivery.mtls.certPinning.enabled, false) ? 
{ + spkiHash = local.callbacks.mock_server_spki_hash + } : {}) + }) + }) + }) + ] + }) + } : local.config_clients + + client_subscriptions = { + for client_id, data in local.config_clients : + client_id => { + for subscription in try(data.subscriptions, []) : + subscription.subscriptionId => { + subscription_id = subscription.subscriptionId + target_ids = try(subscription.targetIds, []) + } + } + } + + client_subscription_targets = { + for client_id, data in local.config_clients : + client_id => merge([ + for subscription in try(data.subscriptions, []) : { + for target_id in try(subscription.targetIds, []) : + "${subscription.subscriptionId}-${target_id}" => { + subscription_id = subscription.subscriptionId + target_id = target_id + } + } + ]...) + } +} diff --git a/infrastructure/terraform/components/cbc/locals_remote_state.tf b/infrastructure/terraform/components/cbc/locals_remote_state.tf new file mode 100644 index 00000000..8fbc867c --- /dev/null +++ b/infrastructure/terraform/components/cbc/locals_remote_state.tf @@ -0,0 +1,59 @@ +locals { + bootstrap = data.terraform_remote_state.bootstrap.outputs + acct = data.terraform_remote_state.acct.outputs + callbacks = data.terraform_remote_state.callbacks.outputs +} + +data "terraform_remote_state" "bootstrap" { + backend = "s3" + + config = { + bucket = local.terraform_state_bucket + + key = format( + "%s/%s/%s/%s/bootstrap.tfstate", + var.project, + var.aws_account_id, + "eu-west-2", + "bootstrap" + ) + + region = "eu-west-2" + } +} + +data "terraform_remote_state" "acct" { + backend = "s3" + + config = { + bucket = local.terraform_state_bucket + + key = format( + "%s/%s/%s/%s/acct.tfstate", + var.project, + var.aws_account_id, + "eu-west-2", + var.parent_acct_environment + ) + + region = "eu-west-2" + } +} + +data "terraform_remote_state" "callbacks" { + backend = "s3" + + config = { + bucket = local.terraform_state_bucket + + key = format( + "%s/%s/%s/%s/callbacks.tfstate", + var.project, + var.aws_account_id, + 
var.region, + var.environment + ) + + region = var.region + } +} diff --git a/infrastructure/terraform/components/cbc/locals_tfscaffold.tf b/infrastructure/terraform/components/cbc/locals_tfscaffold.tf new file mode 100644 index 00000000..b7cf3217 --- /dev/null +++ b/infrastructure/terraform/components/cbc/locals_tfscaffold.tf @@ -0,0 +1,44 @@ +locals { + terraform_state_bucket = format( + "%s-tfscaffold-%s-%s", + var.project, + var.aws_account_id, + var.region, + ) + + csi = replace( + format( + "%s-%s-%s", + var.project, + var.environment, + var.component, + ), + "_", + "", + ) + + # CSI for use in resources with a global namespace, i.e. S3 Buckets + csi_global = replace( + format( + "%s-%s-%s-%s-%s", + var.project, + var.aws_account_id, + var.region, + var.environment, + var.component, + ), + "_", + "", + ) + + default_tags = merge( + var.default_tags, + { + Project = var.project + Environment = var.environment + Component = var.component + Group = var.group + Name = local.csi + }, + ) +} diff --git a/infrastructure/terraform/components/cbc/module_client_delivery.tf b/infrastructure/terraform/components/cbc/module_client_delivery.tf new file mode 100644 index 00000000..823c0912 --- /dev/null +++ b/infrastructure/terraform/components/cbc/module_client_delivery.tf @@ -0,0 +1,48 @@ +module "client_delivery" { + source = "../../modules/client-delivery" + for_each = local.config_clients + + project = var.project + aws_account_id = var.aws_account_id + region = var.region + component = var.component + environment = var.environment + group = var.group + + client_id = each.key + client_bus_name = local.callbacks.event_bus_name + kms_key_arn = local.callbacks.kms_key_arn + + subscriptions = local.client_subscriptions[each.key] + subscription_targets = local.client_subscription_targets[each.key] + + client_config_bucket = local.callbacks.client_config_bucket + client_config_bucket_arn = local.callbacks.client_config_bucket_arn + + applications_map_parameter_name = 
local.callbacks.applications_map_parameter_name + + lambda_s3_bucket = local.callbacks.lambda_s3_bucket + lambda_code_base_path = local.aws_lambda_functions_dir_path + + force_lambda_code_deploy = var.force_lambda_code_deploy + log_level = var.log_level + log_retention_in_days = var.log_retention_in_days + enable_xray_tracing = var.enable_xray_tracing + + log_destination_arn = local.callbacks.log_destination_arn + log_subscription_role_arn = local.callbacks.log_subscription_role_arn + + elasticache_endpoint = local.callbacks.elasticache_endpoint + elasticache_cache_name = local.callbacks.elasticache_cache_name + elasticache_iam_username = local.callbacks.elasticache_iam_username + + mtls_cert_secret_arn = local.callbacks.mtls_cert_secret_arn + mtls_test_cert_s3_bucket = local.callbacks.mtls_test_cert_s3_bucket + mtls_test_cert_s3_key = local.callbacks.mtls_test_cert_s3_key # gitleaks:allow + mtls_test_ca_s3_key = local.callbacks.mtls_test_ca_s3_key # gitleaks:allow + + token_bucket_burst_capacity = var.token_bucket_burst_capacity + + vpc_subnet_ids = local.callbacks.vpc_subnet_ids + lambda_security_group_id = local.callbacks.lambda_security_group_id +} diff --git a/infrastructure/terraform/components/cbc/outputs.tf b/infrastructure/terraform/components/cbc/outputs.tf new file mode 100644 index 00000000..c443add1 --- /dev/null +++ b/infrastructure/terraform/components/cbc/outputs.tf @@ -0,0 +1,15 @@ +## +# Deployment details +## + +output "deployment" { + description = "Deployment details used for post-deployment scripts" + value = { + aws_region = var.region + aws_account_id = var.aws_account_id + project = var.project + environment = var.environment + group = var.group + component = var.component + } +} diff --git a/infrastructure/terraform/components/cbc/pre.sh b/infrastructure/terraform/components/cbc/pre.sh new file mode 100644 index 00000000..cd2e8a22 --- /dev/null +++ b/infrastructure/terraform/components/cbc/pre.sh @@ -0,0 +1,40 @@ +# This script is run 
before the Terraform apply command. +# It syncs client config from S3, copies mock/perf fixtures if needed, and builds lambda workspaces. + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_paths.sh +source "${script_dir}/_paths.sh" + +# Resolve deploy_mock_clients and deploy_perf_runner from tfvars; base_path/group/region/environment are in scope from terraform.sh +deploy_mock_clients="false" +deploy_perf_runner="false" +for _tfvar_file in \ + "${base_path}/etc/group_${group}.tfvars" \ + "${base_path}/etc/env_${region}_${environment}.tfvars"; do + if [ -f "${_tfvar_file}" ]; then + _val=$(grep -E '^\s*deploy_mock_clients\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') + [ -n "${_val}" ] && deploy_mock_clients="${_val}" + _val=$(grep -E '^\s*deploy_perf_runner\s*=' "${_tfvar_file}" | tail -1 | sed 's/.*=\s*//;s/\s*$//') + [ -n "${_val}" ] && deploy_perf_runner="${_val}" + fi +done +echo "deploy_mock_clients resolved to: ${deploy_mock_clients}" +echo "deploy_perf_runner resolved to: ${deploy_perf_runner}" + +pnpm install --frozen-lockfile + +pnpm run generate-dependencies + +"${script_dir}/sync-client-config.sh" + +if [ "${deploy_mock_clients}" == "true" ]; then + cp "${bounded_context_root}/tests/integration/fixtures/subscriptions/"*.json "${clients_dir}/" + echo "Copied mock client subscription config fixtures into clients dir" +fi + +if [ "${deploy_perf_runner}" == "true" ]; then + cp "${bounded_context_root}/tests/performance/fixtures/subscriptions/"*.json "${clients_dir}/" + echo "Copied perf client subscription config fixtures into clients dir" +fi + +pnpm run --recursive --if-present lambda-build diff --git a/infrastructure/terraform/components/cbc/provider_aws.tf b/infrastructure/terraform/components/cbc/provider_aws.tf new file mode 100644 index 00000000..c3ed73bb --- /dev/null +++ b/infrastructure/terraform/components/cbc/provider_aws.tf @@ -0,0 +1,11 @@ +provider "aws" { + region = var.region + + 
allowed_account_ids = [ + var.aws_account_id, + ] + + default_tags { + tags = local.default_tags + } +} diff --git a/infrastructure/terraform/components/cbc/s3_object_client_config.tf b/infrastructure/terraform/components/cbc/s3_object_client_config.tf new file mode 100644 index 00000000..aa7de6c3 --- /dev/null +++ b/infrastructure/terraform/components/cbc/s3_object_client_config.tf @@ -0,0 +1,12 @@ +resource "aws_s3_object" "mock_client_config" { + for_each = var.deploy_mock_clients ? toset(keys(local.config_clients)) : toset([]) + + bucket = local.callbacks.client_config_bucket + key = "client_subscriptions/${local.config_clients[each.key].clientId}.json" + content = jsonencode(local.enriched_mock_config_clients[each.key]) + + kms_key_id = local.callbacks.kms_key_arn + server_side_encryption = "aws:kms" + + content_type = "application/json" +} diff --git a/infrastructure/terraform/components/callbacks/ssm_parameter_applications_map.tf b/infrastructure/terraform/components/cbc/ssm_parameter_applications_map.tf similarity index 83% rename from infrastructure/terraform/components/callbacks/ssm_parameter_applications_map.tf rename to infrastructure/terraform/components/cbc/ssm_parameter_applications_map.tf index 567647d1..60ba72f3 100644 --- a/infrastructure/terraform/components/callbacks/ssm_parameter_applications_map.tf +++ b/infrastructure/terraform/components/cbc/ssm_parameter_applications_map.tf @@ -5,9 +5,9 @@ resource "random_password" "mock_application_id" { } resource "aws_ssm_parameter" "applications_map" { - name = local.applications_map_parameter_name + name = local.callbacks.applications_map_parameter_name type = "SecureString" - key_id = module.kms.key_arn + key_id = local.callbacks.kms_key_arn value = var.deploy_mock_clients ? 
jsonencode({ for id in keys(local.config_clients) : local.config_clients[id].clientId => random_password.mock_application_id[id].result diff --git a/infrastructure/terraform/components/cbc/sync-client-config.sh b/infrastructure/terraform/components/cbc/sync-client-config.sh new file mode 100644 index 00000000..2c2a3ecb --- /dev/null +++ b/infrastructure/terraform/components/cbc/sync-client-config.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# Seeds local client subscription JSON files from S3 into modules/clients/ before Terraform runs. +# Terraform reads those files via fileset() to build local.config_clients. +# On first apply the bucket may not exist yet; this is handled gracefully. + +set -euo pipefail + +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=_paths.sh +source "${script_dir}/_paths.sh" + +: "${ENVIRONMENT:?ENVIRONMENT must be set}" +: "${AWS_REGION:?AWS_REGION must be set}" +: "${AWS_ACCOUNT_ID:?AWS_ACCOUNT_ID must be set}" + +cd "${repo_root}" + +rm -f "${clients_dir}"/*.json + +bucket_name="nhs-${AWS_ACCOUNT_ID}-${AWS_REGION}-${ENVIRONMENT}-callbacks-subscription-config" + +s3_prefix="client_subscriptions/" + +echo "Seeding client configs from s3://${bucket_name}/${s3_prefix} for ${ENVIRONMENT}/${AWS_REGION}" + +if ! sync_output=$(aws s3 sync "s3://${bucket_name}/${s3_prefix}" "${clients_dir}/" \ + --region "${AWS_REGION}" \ + --exclude "*" \ + --include "*.json" \ + --only-show-errors 2>&1); then + if [[ "${sync_output}" == *"NoSuchBucket"* ]]; then + # Expected on first apply before Terraform creates the bucket. + echo "Client config bucket not found yet; skipping sync for first run" + else + echo "Failed to sync client config from S3" >&2 + echo "${sync_output}" >&2 + exit 1 + fi +fi + +# Ensure an empty directory produces a zero-length array rather than a literal "*.json" entry. 
+shopt -s nullglob +seeded_files=("${clients_dir}"/*.json) +seeded_count="${#seeded_files[@]}" +shopt -u nullglob + +echo "Seeded ${seeded_count} client config file(s)" diff --git a/infrastructure/terraform/components/cbc/variables.tf b/infrastructure/terraform/components/cbc/variables.tf new file mode 100644 index 00000000..790020b7 --- /dev/null +++ b/infrastructure/terraform/components/cbc/variables.tf @@ -0,0 +1,96 @@ +## +# Basic Required Variables for tfscaffold Components +## + +variable "project" { + type = string + description = "The name of the tfscaffold project" +} + +variable "environment" { + type = string + description = "The name of the tfscaffold environment" +} + +variable "aws_account_id" { + type = string + description = "The AWS Account ID (numeric)" +} + +variable "region" { + type = string + description = "The AWS Region" +} + +variable "group" { + type = string + description = "The group variables are being inherited from (often synonmous with account short-name)" +} + +## +# tfscaffold variables specific to this component +## + +variable "component" { + type = string + description = "The variable encapsulating the name of this component" + default = "cbc" +} + +variable "default_tags" { + type = map(string) + description = "A map of default tags to apply to all taggable resources within the component" + default = {} +} + +variable "parent_acct_environment" { + type = string + description = "Name of the environment responsible for the acct resources used, affects things like DNS zone. 
Useful for named dev environments" + default = "main" +} + +## +# Variables specific to the component +## + +variable "deploy_mock_clients" { + type = bool + description = "Flag to deploy mock client subscription config for integration testing (test/dev environments only)" + default = false +} + +variable "deploy_perf_runner" { + type = bool + description = "Flag to deploy performance test client subscription fixtures" + default = false +} + +variable "token_bucket_burst_capacity" { + type = number + description = "Token bucket burst capacity used by the rate limiter" + default = 2250 +} + +variable "log_retention_in_days" { + type = number + description = "The retention period in days for the Cloudwatch Logs events to be retained, default of 0 is indefinite" + default = 0 +} + +variable "log_level" { + type = string + description = "The log level to be used in lambda functions within the component" + default = "INFO" +} + +variable "force_lambda_code_deploy" { + type = bool + description = "If the lambda package in s3 has the same commit id tag as the terraform build branch, the lambda will not update automatically. 
Set to True if making changes to Lambda code from on the same commit for example during development" + default = false +} + +variable "enable_xray_tracing" { + type = bool + description = "Enable AWS X-Ray active tracing for Lambda functions" + default = false +} diff --git a/infrastructure/terraform/components/cbc/versions.tf b/infrastructure/terraform/components/cbc/versions.tf new file mode 100644 index 00000000..55552749 --- /dev/null +++ b/infrastructure/terraform/components/cbc/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = "6.13" + } + random = { + source = "hashicorp/random" + version = "~> 3.0" + } + } + + required_version = ">= 1.10.1" +} diff --git a/infrastructure/terraform/modules/client-delivery/README.md b/infrastructure/terraform/modules/client-delivery/README.md index 0a4965e7..2036c60d 100644 --- a/infrastructure/terraform/modules/client-delivery/README.md +++ b/infrastructure/terraform/modules/client-delivery/README.md @@ -45,6 +45,7 @@ No requirements. | [sqs\_visibility\_timeout\_seconds](#input\_sqs\_visibility\_timeout\_seconds) | Visibility timeout for the per-client delivery queue | `number` | `60` | no | | [subscription\_targets](#input\_subscription\_targets) | Flattened subscription-target fanout map keyed by subscription-target composite key |
map(object({
subscription_id = string
target_id = string
}))
| n/a | yes | | [subscriptions](#input\_subscriptions) | Subscription definitions for this client, keyed by subscription\_id |
map(object({
subscription_id = string
target_ids = list(string)
}))
| n/a | yes | +| [token\_bucket\_burst\_capacity](#input\_token\_bucket\_burst\_capacity) | Token bucket burst capacity used by the rate limiter | `number` | `2250` | no | | [vpc\_subnet\_ids](#input\_vpc\_subnet\_ids) | VPC subnet IDs for Lambda execution | `list(string)` | `[]` | no | ## Modules diff --git a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf index 1260d471..0021fb80 100644 --- a/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf +++ b/infrastructure/terraform/modules/client-delivery/module_https_client_lambda.tf @@ -53,6 +53,7 @@ module "https_client_lambda" { MTLS_TEST_CERT_S3_BUCKET = var.mtls_test_cert_s3_bucket MTLS_TEST_CERT_S3_KEY = var.mtls_test_cert_s3_key # gitleaks:allow QUEUE_URL = module.sqs_delivery.sqs_queue_url + TOKEN_BUCKET_BURST_CAPACITY = tostring(var.token_bucket_burst_capacity) } vpc_config = var.lambda_security_group_id != "" ? 
{ diff --git a/infrastructure/terraform/modules/client-delivery/variables.tf b/infrastructure/terraform/modules/client-delivery/variables.tf index 643e163e..801ca291 100644 --- a/infrastructure/terraform/modules/client-delivery/variables.tf +++ b/infrastructure/terraform/modules/client-delivery/variables.tf @@ -181,6 +181,12 @@ variable "mtls_test_ca_s3_key" { default = "" } +variable "token_bucket_burst_capacity" { + type = number + description = "Token bucket burst capacity used by the rate limiter" + default = 2250 +} + variable "elasticache_endpoint" { type = string description = "ElastiCache Serverless endpoint URL" diff --git a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts index 6aab4727..43aa2fb6 100644 --- a/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/admit-lua.test.ts @@ -1,32 +1,32 @@ import admitLuaSrc from "services/admit.lua"; import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; -// ARGV: [now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs] -// KEYS: [cbKey, rlKey] -// Returns: [allowed (0|1), reason, retryAfterMs, effectiveRate] +// ARGV: [now, capacity, targetRateLimit, cooldownMs, recoveryPeriodMs, probeRateLimit, targetBatchSize] +// KEYS: [epKey] +// Returns: [consumedTokens, reason, retryAfterMs, effectiveRate] type AdmitArgs = { now: number; capacity: number; - refillPerSec: number; + targetRateLimit: number; cooldownMs: number; - decayPeriodMs: number; - cbWindowPeriodMs: number; - cbProbeIntervalMs: number; + recoveryPeriodMs: number; + probeRateLimit: number; + targetBatchSize: number; }; const defaultArgs: AdmitArgs = { now: 1_000_000, - capacity: 10, - refillPerSec: 10, - cooldownMs: 60_000, - decayPeriodMs: 300_000, - cbWindowPeriodMs: 60_000, - cbProbeIntervalMs: 60_000, + capacity: 2250, + targetRateLimit: 10, + cooldownMs: 120_000, + 
recoveryPeriodMs: 600_000, + probeRateLimit: 1 / 60, + targetBatchSize: 1, }; type AdmitResult = { - allowed: number; + consumedTokens: number; reason: string; retryAfterMs: number; effectiveRate: number; @@ -40,20 +40,20 @@ function runAdmit( const merged = { ...defaultArgs, ...args }; const raw = evalLua( admitLuaSrc, - [`cb:${targetId}`, `rl:${targetId}`], + [`ep:${targetId}`], [ merged.now.toString(), merged.capacity.toString(), - merged.refillPerSec.toString(), + merged.targetRateLimit.toString(), merged.cooldownMs.toString(), - merged.decayPeriodMs.toString(), - merged.cbWindowPeriodMs.toString(), - merged.cbProbeIntervalMs.toString(), + merged.recoveryPeriodMs.toString(), + merged.probeRateLimit.toString(), + merged.targetBatchSize.toString(), ], store, ) as [number, string, number, number]; return { - allowed: raw[0], + consumedTokens: raw[0], reason: raw[1], retryAfterMs: raw[2], effectiveRate: raw[3], @@ -62,399 +62,453 @@ function runAdmit( describe("admit.lua", () => { describe("rate limiting", () => { - it("allows the first request with full token bucket", () => { - const store = createRedisStore(); - const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store); - - expect(allowed).toBe(1); - expect(reason).toBe("allowed"); - expect(retryAfterMs).toBe(0); - expect(effectiveRate).toBe(10); - }); - - it("depletes tokens on consecutive calls and rejects when empty", () => { + it("enters half-open probe on a fresh endpoint with no prior state", () => { const store = createRedisStore(); + const now = 1_000_000; - for (let i = 0; i < 10; i++) { - const { allowed } = runAdmit(store); - expect(allowed).toBe(1); - } + const { consumedTokens, effectiveRate, reason } = runAdmit(store, { + now, + targetRateLimit: 10, + }); - const { allowed, reason } = runAdmit(store); - expect(allowed).toBe(0); + expect(consumedTokens).toBe(0); expect(reason).toBe("rate_limited"); + expect(effectiveRate).toBeCloseTo(1 / 60, 5); }); - it("returns retryAfterMs when rate 
limited", () => { + it("does not persist circuit state on first contact", () => { const store = createRedisStore(); + const now = 1_000_000; - for (let i = 0; i < 10; i++) { - runAdmit(store); - } + runAdmit(store, { now, targetRateLimit: 10 }); - const { retryAfterMs } = runAdmit(store); - expect(retryAfterMs).toBe(1000); + const epHash = store.get("ep:t1")!; + expect(epHash.has("is_open")).toBe(false); + expect(epHash.has("switched_at")).toBe(false); }); - it("reports effective rate when rate limited", () => { + it("allows full rate after record-result closes the circuit", () => { const store = createRedisStore(); + const now = 1_000_000; + + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", now.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ]), + ); - for (let i = 0; i < 10; i++) { - runAdmit(store); - } + const later = now + 60_000; + const { consumedTokens, reason } = runAdmit(store, { + now: later, + targetRateLimit: 10, + recoveryPeriodMs: 600_000, + }); - const { effectiveRate } = runAdmit(store); - expect(effectiveRate).toBe(10); + expect(consumedTokens).toBeGreaterThanOrEqual(1); + expect(reason).toBe("allowed"); }); - it("refills tokens over time", () => { + it("allows a single request when bucket has tokens from refill", () => { const store = createRedisStore(); const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ["switched_at", "0"], + ]), + ); - for (let i = 0; i < 10; i++) { - runAdmit(store, { now }); - } - - const denied = runAdmit(store, { now }); - expect(denied.allowed).toBe(0); + const { consumedTokens, reason, retryAfterMs } = runAdmit(store, { + now, + targetRateLimit: 10, + }); - const refilled = runAdmit(store, { now: now + 1000 }); - expect(refilled.allowed).toBe(1); + expect(consumedTokens).toBe(1); + expect(reason).toBe("allowed"); + expect(retryAfterMs).toBe(0); }); - it("caps tokens at 
capacity", () => { + it("consumes up to targetBatchSize tokens", () => { const store = createRedisStore(); const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["bucket_tokens", "5"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - runAdmit(store, { now, capacity: 5, refillPerSec: 100 }); - - // Advance 10 seconds — would add 1000 tokens without cap - runAdmit(store, { now: now + 10_000, capacity: 5, refillPerSec: 100 }); - - const rlHash = store.get("rl:t1")!; - // Refill capped to capacity (5), then one consumed → 4 - expect(Number(rlHash.get("tokens"))).toBe(4); + const { consumedTokens } = runAdmit(store, { + now, + targetBatchSize: 3, + }); + expect(consumedTokens).toBe(3); }); - it("handles zero refill rate", () => { + it("consumes all available when batch exceeds available tokens", () => { const store = createRedisStore(); + const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["bucket_tokens", "2"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - for (let i = 0; i < 10; i++) { - runAdmit(store, { refillPerSec: 0 }); - } - - const { allowed, reason, retryAfterMs } = runAdmit(store, { - refillPerSec: 0, + const { consumedTokens } = runAdmit(store, { + now, + targetBatchSize: 5, }); - expect(allowed).toBe(0); - expect(reason).toBe("rate_limited"); - expect(retryAfterMs).toBe(1000); + expect(consumedTokens).toBe(2); }); - }); - describe("circuit breaker", () => { - it("rejects when circuit is open", () => { + it("returns rate_limited when no tokens available", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 60_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", now.toString()], + ["is_open", "0"], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - const { allowed, effectiveRate, reason } 
= runAdmit(store, { now }); - expect(allowed).toBe(0); - expect(reason).toBe("circuit_open"); - expect(effectiveRate).toBe(0); + const { consumedTokens, reason, retryAfterMs } = runAdmit(store, { now }); + expect(consumedTokens).toBe(0); + expect(reason).toBe("rate_limited"); + expect(retryAfterMs).toBe(1000); }); - it("returns retryAfterMs for open circuit", () => { + it("refills tokens over time", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 30_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", now.toString()], + ["is_open", "0"], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - const { retryAfterMs } = runAdmit(store, { now }); - expect(retryAfterMs).toBe(30_000); + const { consumedTokens } = runAdmit(store, { + now: now + 1000, + targetRateLimit: 10, + }); + expect(consumedTokens).toBe(1); }); - it("allows probe when probe interval has elapsed", () => { + it("caps tokens at capacity", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 61_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["is_open", "0"], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ["switched_at", "0"], ]), ); - const { allowed, effectiveRate, reason, retryAfterMs } = runAdmit(store, { + const { consumedTokens } = runAdmit(store, { now, - cbProbeIntervalMs: 60_000, + capacity: 5, + targetRateLimit: 100, + targetBatchSize: 10, }); - expect(allowed).toBe(1); - expect(reason).toBe("probe"); - expect(retryAfterMs).toBe(0); - expect(effectiveRate).toBe(0); + expect(consumedTokens).toBe(5); }); - it("updates last_probe_ms after allowing a probe", () => { + it("handles zero refill rate", () => { const store = createRedisStore(); const now = 1_000_000; - const 
openedUntil = now + 120_000; - const lastProbe = now - 61_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["is_open", "0"], + ["bucket_tokens", "0"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], ]), ); - runAdmit(store, { now, cbProbeIntervalMs: 60_000 }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("last_probe_ms")).toBe(now.toString()); + const { consumedTokens, reason } = runAdmit(store, { + now: now + 10_000, + targetRateLimit: 0, + }); + expect(consumedTokens).toBe(0); + expect(reason).toBe("rate_limited"); }); - it("does not probe when interval has not elapsed", () => { + it("preserves fractional refill time (bucketRefilledAt += generationTime, not now)", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; - const lastProbe = now - 30_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", lastProbe.toString()], + ["is_open", "0"], + ["bucket_tokens", "0"], + ["bucket_refilled_at", (now - 150).toString()], + ["switched_at", "0"], ]), ); - const { allowed, reason } = runAdmit(store, { - now, - cbProbeIntervalMs: 60_000, - }); - expect(allowed).toBe(0); - expect(reason).toBe("circuit_open"); + runAdmit(store, { now, targetRateLimit: 10 }); + + const epHash = store.get("ep:t1")!; + const refilledAt = Number(epHash.get("bucket_refilled_at")); + // 1 token generated at rate 10/s takes 100ms, so refilledAt = (now-150) + 100 = now - 50 + expect(refilledAt).toBe(now - 50); }); + }); - it("does not probe when cbProbeIntervalMs is 0", () => { + describe("circuit breaker states", () => { + it("blocks completely when circuit is open during cooldown", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 120_000; + const switchedAt = now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - 
["opened_until_ms", openedUntil.toString()], - ["last_probe_ms", "0"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], ]), ); - const { allowed, reason } = runAdmit(store, { + const { consumedTokens, reason } = runAdmit(store, { now, - cbProbeIntervalMs: 0, + cooldownMs: 120_000, }); - expect(allowed).toBe(0); + expect(consumedTokens).toBe(0); expect(reason).toBe("circuit_open"); }); - }); - describe("sliding window", () => { - it("initialises cbWindowFrom on first call", () => { + it("does not consume bucket tokens when fully open", () => { const store = createRedisStore(); const now = 1_000_000; - - runAdmit(store, { now }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_window_from")).toBe(now.toString()); - }); - - it("rolls current window to previous when period expires", () => { - const store = createRedisStore(); - const cbWindowPeriodMs = 60_000; - const t0 = 1_000_000; - const t1 = t0 + cbWindowPeriodMs + 1; + const switchedAt = now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", t0.toString()], - ["cb_failures", "5"], - ["cb_attempts", "10"], - ["cb_prev_failures", "0"], - ["cb_prev_attempts", "0"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], + ["bucket_refilled_at", now.toString()], ]), ); - runAdmit(store, { now: t1, cbWindowPeriodMs }); + runAdmit(store, { now, cooldownMs: 120_000 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_prev_failures")).toBe("5"); - expect(cbHash.get("cb_prev_attempts")).toBe("10"); - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe(t1.toString()); + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(100); }); - it("clears both windows when gap exceeds two periods", () => { + it("returns retryAfterMs for open circuit", () => { const store = createRedisStore(); 
- const cbWindowPeriodMs = 60_000; - const t0 = 1_000_000; - const t1 = t0 + 2 * cbWindowPeriodMs + 1; + const now = 1_000_000; + const switchedAt = now - 10_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", t0.toString()], - ["cb_failures", "5"], - ["cb_attempts", "10"], - ["cb_prev_failures", "3"], - ["cb_prev_attempts", "7"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], ]), ); - runAdmit(store, { now: t1, cbWindowPeriodMs }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_prev_failures")).toBe("0"); - expect(cbHash.get("cb_prev_attempts")).toBe("0"); - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe(t1.toString()); + const { retryAfterMs } = runAdmit(store, { now, cooldownMs: 120_000 }); + expect(retryAfterMs).toBe(110_000); }); - }); - describe("decay scaling", () => { - it("applies reduced rate during decay period", () => { + it("uses probeRateLimit when half-open (after cooldown)", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const halfwayThrough = closedAt + decayPeriodMs / 2; + const now = 1_000_000; + const switchedAt = now - 130_000; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", (now - 60_000).toString()], + ]), + ); const { effectiveRate } = runAdmit(store, { - now: halfwayThrough, - refillPerSec: 10, - decayPeriodMs, + now, + cooldownMs: 120_000, + probeRateLimit: 1 / 60, }); - expect(effectiveRate).toBe(5); + expect(effectiveRate).toBeCloseTo(1 / 60, 5); }); - it("uses full rate after decay period ends", () => { + it("zeroes residual bucket tokens when circuit is half-open", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; 
- const afterDecay = closedAt + decayPeriodMs + 1; + const now = 1_000_000; + const switchedAt = now - 130_000; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "100"], + ["bucket_refilled_at", (now - 60_000).toString()], + ]), + ); - const { allowed, effectiveRate } = runAdmit(store, { - now: afterDecay, - refillPerSec: 10, - decayPeriodMs, + const { consumedTokens } = runAdmit(store, { + now, + cooldownMs: 120_000, + probeRateLimit: 1 / 60, }); - expect(allowed).toBe(1); - expect(effectiveRate).toBe(10); + + expect(consumedTokens).toBe(1); + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(0); }); - it("clamps minimum effective rate to 1", () => { + it("uses recovery ramp when closed during recovery period", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; - const veryEarly = closedAt + 1; + const switchedAt = 1_000_000; + const recoveryPeriodMs = 600_000; + const now = switchedAt + recoveryPeriodMs / 2; - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ]), + ); const { effectiveRate } = runAdmit(store, { - now: veryEarly, - refillPerSec: 10, - decayPeriodMs, + now, + targetRateLimit: 10, + recoveryPeriodMs, }); - expect(effectiveRate).toBeGreaterThanOrEqual(1); + const probeRate = defaultArgs.probeRateLimit; + const expectedRate = probeRate + 0.5 * (10 - probeRate); + expect(effectiveRate).toBeCloseTo(expectedRate, 5); }); - it("clears openedUntil when decay period fully elapses", () => { + it("uses full rate when closed and past recovery period", () => { const store = createRedisStore(); - const closedAt = 1_000_000; - const decayPeriodMs = 300_000; 
- const afterDecay = closedAt + decayPeriodMs + 1; - - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + const switchedAt = 100_000; + const recoveryPeriodMs = 600_000; + const now = switchedAt + recoveryPeriodMs + 1; - runAdmit(store, { now: afterDecay, decayPeriodMs }); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", switchedAt.toString()], + ["bucket_tokens", "0"], + ["bucket_refilled_at", "0"], + ]), + ); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + const { effectiveRate } = runAdmit(store, { + now, + targetRateLimit: 10, + recoveryPeriodMs, + }); + expect(effectiveRate).toBe(10); }); + }); - it("does not decay when decayPeriodMs is 0", () => { + describe("state persistence", () => { + it("persists bucket_tokens and bucket_refilled_at", () => { const store = createRedisStore(); - const closedAt = 1_000_000; + const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["bucket_tokens", "5"], + ["bucket_refilled_at", now.toString()], + ["switched_at", "0"], + ]), + ); - store.set("cb:t1", new Map([["opened_until_ms", closedAt.toString()]])); + runAdmit(store, { now, targetBatchSize: 2 }); - const { allowed, effectiveRate } = runAdmit(store, { - now: closedAt + 1, - refillPerSec: 10, - decayPeriodMs: 0, - }); - expect(allowed).toBe(1); - expect(effectiveRate).toBe(10); + const epHash = store.get("ep:t1")!; + expect(Number(epHash.get("bucket_tokens"))).toBe(3); }); - }); - describe("state persistence", () => { - it("persists token count and last_refill_ms", () => { + it("does not write any fields when circuit_open early return", () => { const store = createRedisStore(); - runAdmit(store, { now: 1_000_000, capacity: 5 }); + runAdmit(store, { + now: 10_000, + }); - const rlHash = store.get("rl:t1")!; - expect(rlHash.get("tokens")).toBeDefined(); - expect(rlHash.get("last_refill_ms")).toBe("1000000"); + expect(store.has("ep:t1")).toBe(false); }); - 
it("persists circuit breaker fields", () => { + it("does not write sampling or circuit fields on half-open path", () => { const store = createRedisStore(); - runAdmit(store, { now: 1_000_000 }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.has("opened_until_ms")).toBe(true); - expect(cbHash.has("cb_window_from")).toBe(true); - expect(cbHash.has("cb_failures")).toBe(true); - expect(cbHash.has("cb_attempts")).toBe(true); - expect(cbHash.has("cb_prev_failures")).toBe(true); - expect(cbHash.has("cb_prev_attempts")).toBe(true); + runAdmit(store, { + now: 200_000, + }); + + const epHash = store.get("ep:t1")!; + expect(epHash.has("bucket_tokens")).toBe(true); + expect(epHash.has("bucket_refilled_at")).toBe(true); + expect(epHash.has("cur_attempts")).toBe(false); + expect(epHash.has("cur_failures")).toBe(false); + expect(epHash.has("sample_till")).toBe(false); + expect(epHash.has("is_open")).toBe(false); + expect(epHash.has("switched_at")).toBe(false); }); it("isolates state between targets", () => { const store = createRedisStore(); - runAdmit(store, {}, "target-a"); - runAdmit(store, {}, "target-b"); + store.set( + "ep:target-a", + new Map([ + ["is_open", "0"], + ["bucket_tokens", "5"], + ["bucket_refilled_at", "10000"], + ]), + ); + store.set( + "ep:target-b", + new Map([ + ["is_open", "0"], + ["bucket_tokens", "3"], + ["bucket_refilled_at", "10000"], + ]), + ); + + runAdmit(store, { now: 10_000 }, "target-a"); + runAdmit(store, { now: 10_000 }, "target-b"); - expect(store.has("cb:target-a")).toBe(true); - expect(store.has("cb:target-b")).toBe(true); - expect(store.has("rl:target-a")).toBe(true); - expect(store.has("rl:target-b")).toBe(true); + expect(store.has("ep:target-a")).toBe(true); + expect(store.has("ep:target-b")).toBe(true); }); }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts index efbc6d88..eea9d44d 100644 --- 
a/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/endpoint-gate.test.ts @@ -11,13 +11,13 @@ const mockDisconnect = jest.fn().mockResolvedValue(undefined); const mockOn = jest.fn(); const defaultConfig: EndpointGateConfig = { - burstCapacity: 10, - cbProbeIntervalMs: 60_000, - decayPeriodMs: 300_000, - cbWindowPeriodMs: 60_000, - cbErrorThreshold: 0.5, - cbMinAttempts: 10, - cbCooldownMs: 60_000, + burstCapacity: 2250, + probeRateLimit: 1 / 60, + recoveryPeriodMs: 600_000, + samplePeriodMs: 300_000, + failureThreshold: 0.3, + minAttempts: 5, + cooldownPeriodMs: 120_000, }; const mockRedis = { @@ -34,12 +34,23 @@ beforeEach(() => { }); describe("admit", () => { - it("returns allowed when tokens available", async () => { - mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]); + it("returns allowed with consumedTokens when tokens available", async () => { + mockSendCommand.mockResolvedValueOnce([5, "allowed", 0, 10]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 5, + defaultConfig, + ); - expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 }); + expect(result).toEqual({ + allowed: true, + consumedTokens: 5, + effectiveRate: 10, + }); expect(mockSendCommand).toHaveBeenCalledWith( expect.arrayContaining(["EVALSHA"]), ); @@ -48,7 +59,14 @@ describe("admit", () => { it("returns rate_limited when tokens exhausted", async () => { mockSendCommand.mockResolvedValueOnce([0, "rate_limited", 1000, 10]); - const result = await admit(mockRedis, "target-1", 10, false, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + false, + 5, + defaultConfig, + ); expect(result).toEqual({ allowed: false, @@ -58,18 +76,17 @@ describe("admit", () => { }); }); - it("returns allowed with probe flag when circuit is open but probe slot is available", async () => { - 
mockSendCommand.mockResolvedValueOnce([1, "probe", 0, 0]); - - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); - - expect(result).toEqual({ allowed: true, probe: true, effectiveRate: 0 }); - }); - - it("returns circuit_open without probe slot", async () => { + it("returns circuit_open when circuit is fully open", async () => { mockSendCommand.mockResolvedValueOnce([0, "circuit_open", 30_000, 0]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 5, + defaultConfig, + ); expect(result).toEqual({ allowed: false, @@ -84,9 +101,20 @@ describe("admit", () => { .mockRejectedValueOnce(new Error("NOSCRIPT No matching script")) .mockResolvedValueOnce([1, "allowed", 0, 10]); - const result = await admit(mockRedis, "target-1", 10, true, defaultConfig); + const result = await admit( + mockRedis, + "target-1", + 10, + true, + 1, + defaultConfig, + ); - expect(result).toEqual({ allowed: true, probe: false, effectiveRate: 10 }); + expect(result).toEqual({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); expect(mockSendCommand).toHaveBeenCalledTimes(2); expect(mockSendCommand).toHaveBeenNthCalledWith( 1, @@ -98,25 +126,33 @@ describe("admit", () => { ); }); - it("passes cbProbeIntervalMs=0 when circuit breaker is disabled", async () => { + it("passes probeRateLimit=0 when circuit breaker is disabled", async () => { mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 10]); - await admit(mockRedis, "target-1", 10, false, defaultConfig); + await admit(mockRedis, "target-1", 10, false, 1, defaultConfig); - // EVALSHA layout: [EVALSHA, sha, keyCount, cbKey, rlKey, now, capacity, refillPerSec, cooldownMs, decayPeriodMs, cbWindowPeriodMs, cbProbeIntervalMs] const args = mockSendCommand.mock.calls[0]![0] as string[]; - const cbProbeIntervalArg = args[11]; - expect(cbProbeIntervalArg).toBe("0"); + const probeRateArg = args[9]; + 
expect(probeRateArg).toBe("0"); }); - it("passes cbKey first, rlKey second", async () => { + it("passes single epKey", async () => { mockSendCommand.mockResolvedValueOnce([1, "allowed", 0, 5]); - await admit(mockRedis, "my-target", 5, true, defaultConfig); + await admit(mockRedis, "my-target", 5, true, 1, defaultConfig); + + const args = mockSendCommand.mock.calls[0]![0] as string[]; + expect(args[3]).toBe("ep:{my-target}"); + }); + + it("passes targetBatchSize as ARGV", async () => { + mockSendCommand.mockResolvedValueOnce([3, "allowed", 0, 10]); + + await admit(mockRedis, "target-1", 10, true, 7, defaultConfig); const args = mockSendCommand.mock.calls[0]![0] as string[]; - expect(args[3]).toBe("cb:{my-target}"); - expect(args[4]).toBe("rl:{my-target}"); + const batchSizeArg = args[10]; + expect(batchSizeArg).toBe("7"); }); }); @@ -130,6 +166,7 @@ describe("evalScript", () => { "target-1", 10, true, + 1, defaultConfig, ).catch((error: unknown) => error); @@ -149,6 +186,7 @@ describe("evalScript", () => { "target-1", 10, true, + 1, defaultConfig, ).catch((error: unknown) => error); @@ -159,70 +197,101 @@ describe("evalScript", () => { }); describe("recordResult", () => { - it("returns closed on success below threshold", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + it("returns closed state when circuit is steady-state", async () => { + mockSendCommand.mockResolvedValueOnce(["closed", 0]); const result = await recordResult( mockRedis, "target-1", - true, + 5, + 0, defaultConfig, ); - expect(result).toEqual({ ok: true, state: "closed" }); + expect(result).toEqual({ circuitState: "closed", stateChanged: false }); expect(mockSendCommand).toHaveBeenCalledWith( expect.arrayContaining(["EVALSHA"]), ); }); - it("returns opened when failure crosses threshold", async () => { - mockSendCommand.mockResolvedValueOnce([0, "opened"]); + it("returns open with stateChanged when failure crosses threshold", async () => { + 
mockSendCommand.mockResolvedValueOnce(["open", 1]); const result = await recordResult( mockRedis, "target-1", - false, + 5, + 5, defaultConfig, ); - expect(result).toEqual({ ok: false, state: "opened" }); + expect(result).toEqual({ circuitState: "open", stateChanged: true }); }); - it("returns failed when failure is below threshold", async () => { - mockSendCommand.mockResolvedValueOnce([0, "failed"]); + it("returns closed_recovery with stateChanged when circuit closes", async () => { + mockSendCommand.mockResolvedValueOnce(["closed_recovery", 1]); const result = await recordResult( mockRedis, "target-1", - false, + 5, + 0, + defaultConfig, + ); + + expect(result).toEqual({ + circuitState: "closed_recovery", + stateChanged: true, + }); + }); + + it("returns half_open without stateChanged when probing", async () => { + mockSendCommand.mockResolvedValueOnce(["half_open", 0]); + + const result = await recordResult( + mockRedis, + "target-1", + 5, + 1, defaultConfig, ); - expect(result).toEqual({ ok: false, state: "failed" }); + expect(result).toEqual({ circuitState: "half_open", stateChanged: false }); }); it("falls back to EVAL on NOSCRIPT error", async () => { mockSendCommand .mockRejectedValueOnce(new Error("NOSCRIPT No matching script")) - .mockResolvedValueOnce([1, "closed"]); + .mockResolvedValueOnce(["closed", 0]); const result = await recordResult( mockRedis, "target-1", - true, + 1, + 0, defaultConfig, ); - expect(result).toEqual({ ok: true, state: "closed" }); + expect(result).toEqual({ circuitState: "closed", stateChanged: false }); expect(mockSendCommand).toHaveBeenCalledTimes(2); }); - it("passes correct cb key for target", async () => { - mockSendCommand.mockResolvedValueOnce([1, "closed"]); + it("passes correct ep key for target", async () => { + mockSendCommand.mockResolvedValueOnce(["closed", 0]); + + await recordResult(mockRedis, "my-target", 1, 0, defaultConfig); + + const args = mockSendCommand.mock.calls[0]![0] as string[]; + 
expect(args[3]).toBe("ep:{my-target}"); + }); + + it("passes consumedTokens and processingFailures as ARGV", async () => { + mockSendCommand.mockResolvedValueOnce(["closed", 0]); - await recordResult(mockRedis, "my-target", true, defaultConfig); + await recordResult(mockRedis, "target-1", 8, 3, defaultConfig); const args = mockSendCommand.mock.calls[0]![0] as string[]; - expect(args[3]).toBe("cb:{my-target}"); + expect(args[5]).toBe("8"); + expect(args[6]).toBe("3"); }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/handler.test.ts b/lambdas/https-client-lambda/src/__tests__/handler.test.ts index 3b8ad521..ab5dff46 100644 --- a/lambdas/https-client-lambda/src/__tests__/handler.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/handler.test.ts @@ -3,7 +3,6 @@ import { DEFAULT_TARGET, makeRecord, } from "__tests__/fixtures/handler-fixtures"; -import { VisibilityManagedError } from "services/visibility-managed-error"; jest.mock("@nhs-notify-client-callbacks/logger", () => ({ logger: { @@ -74,17 +73,20 @@ jest.mock("services/redis-client", () => ({ getRedisClient: (...args: unknown[]) => mockGetRedisClient(...args), })); +jest.mock("services/delivery-observability", () => ({ + recordAdmissionDenied: jest.fn(), + recordCircuitBreakerClosed: jest.fn(), + recordCircuitBreakerOpen: jest.fn(), + recordDeliveryAttempt: jest.fn(), + recordDeliveryDuration: jest.fn(), + recordDeliveryFailure: jest.fn(), + recordDeliveryPermanentFailure: jest.fn(), + recordDeliveryRateLimited: jest.fn(), + recordDeliverySuccess: jest.fn(), + recordRetryWindowExhausted: jest.fn(), +})); + jest.mock("services/delivery-metrics", () => ({ - emitAdmissionDenied: jest.fn(), - emitCircuitBreakerClosed: jest.fn(), - emitCircuitBreakerOpen: jest.fn(), - emitDeliveryAttempt: jest.fn(), - emitDeliveryDuration: jest.fn(), - emitDeliveryFailure: jest.fn(), - emitDeliveryPermanentFailure: jest.fn(), - emitDeliverySuccess: jest.fn(), - emitRateLimited: jest.fn(), - emitRetryWindowExhausted: 
jest.fn(), flushMetrics: jest.fn().mockResolvedValue(undefined), resetMetrics: jest.fn(), })); @@ -106,15 +108,15 @@ describe("processRecords", () => { mockJitteredBackoff.mockReturnValue(5); mockIsWindowExhausted.mockReturnValue(false); mockHandleRateLimitedRecord.mockRejectedValue( - new VisibilityManagedError("Rate limited — requeue"), + new Error("Rate limited — requeue"), ); mockGetRedisClient.mockResolvedValue({}); mockAdmit.mockResolvedValue({ allowed: true, - probe: false, + consumedTokens: 100, effectiveRate: 10, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + mockRecordResult.mockResolvedValue({ ok: true, state: "ok" }); }); it("returns no failures on successful delivery", async () => { @@ -159,7 +161,7 @@ describe("processRecords", () => { expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); }); - it("returns failure for 429 rate-limited responses", async () => { + it("returns failure for 429 when handleRateLimitedRecord rejects", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", retryAfterHeader: "60", @@ -177,7 +179,7 @@ describe("processRecords", () => { ); }); - it("processes multiple records independently", async () => { + it("processes multiple records in a single target batch", async () => { const record1 = makeRecord({ messageId: "msg-1" }); const record2 = makeRecord({ messageId: "msg-2" }); @@ -191,25 +193,45 @@ describe("processRecords", () => { const failures = await processRecords([record1, record2]); expect(failures).toEqual([{ itemIdentifier: "msg-2" }]); + expect(mockAdmit).toHaveBeenCalledTimes(1); }); - it("an unexpected error on one record does not prevent subsequent records being processed", async () => { + it("delivers only admitted records when consumedTokens is less than batch size", async () => { const record1 = makeRecord({ messageId: "msg-1" }); const record2 = makeRecord({ messageId: "msg-2" }); + const record3 = makeRecord({ messageId: "msg-3" }); + + 
mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); + + const failures = await processRecords([record1, record2, record3]); - mockLoadTargetConfig - .mockRejectedValueOnce(new Error("S3 unavailable")) - .mockResolvedValueOnce(DEFAULT_TARGET); + expect(mockDeliverPayload).toHaveBeenCalledTimes(1); + expect(failures).toEqual([ + { itemIdentifier: "msg-2" }, + { itemIdentifier: "msg-3" }, + ]); + }); + + it("an unexpected delivery error does not prevent other records in the batch", async () => { + const record1 = makeRecord({ messageId: "msg-1" }); + const record2 = makeRecord({ messageId: "msg-2" }); + + mockDeliverPayload + .mockRejectedValueOnce(new Error("Connection reset")) + .mockResolvedValueOnce({ outcome: "success" }); const failures = await processRecords([record1, record2]); expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); - expect(mockDeliverPayload).toHaveBeenCalledTimes(1); expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5); }); it("applies jittered backoff cooldown on unexpected errors", async () => { - mockLoadTargetConfig.mockRejectedValue(new Error("Infrastructure error")); + mockDeliverPayload.mockRejectedValue(new Error("Infrastructure error")); const failures = await processRecords([makeRecord()]); @@ -217,7 +239,7 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 5); }); - it("does not apply a second visibility change for admission-denied (managed path)", async () => { + it("changes visibility once per record for admission-denied batch", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -230,7 +252,7 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledTimes(1); }); - it("does not apply a second visibility change for transient failure (managed path)", async () => { + it("changes visibility once for transient failure", async () => { 
mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -241,13 +263,13 @@ describe("processRecords", () => { expect(mockChangeVisibility).toHaveBeenCalledTimes(1); }); - it("returns failure when CLIENT_ID is not set", async () => { + it("throws when CLIENT_ID is not set", async () => { const saved = process.env.CLIENT_ID; delete process.env.CLIENT_ID; - const failures = await processRecords([makeRecord()]); - - expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); + await expect(processRecords([makeRecord()])).rejects.toThrow( + "CLIENT_ID is required", + ); process.env.CLIENT_ID = saved; }); @@ -262,7 +284,7 @@ describe("processRecords", () => { expect(mockDeliverPayload).not.toHaveBeenCalled(); }); - it("calls changeVisibility with backoff on 5xx then throws", async () => { + it("calls changeVisibility with backoff on 5xx", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -303,7 +325,7 @@ describe("processRecords", () => { expect(failures).toEqual([]); }); - it("requeues when rate limited by endpoint gate", async () => { + it("requeues all records when rate limited by endpoint gate", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -314,12 +336,14 @@ describe("processRecords", () => { const failures = await processRecords([makeRecord()]); expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); - expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 2); + const visibilityDelay = mockChangeVisibility.mock.calls[0]![1] as number; + expect(visibilityDelay).toBeGreaterThanOrEqual(2); + expect(visibilityDelay).toBeLessThanOrEqual(6); expect(mockSendToDlq).not.toHaveBeenCalled(); expect(mockDeliverPayload).not.toHaveBeenCalled(); }); - it("requeues when circuit is open", async () => { + it("requeues all records when circuit is open", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "circuit_open", @@ -330,7 +354,9 
@@ describe("processRecords", () => { const failures = await processRecords([makeRecord()]); expect(failures).toEqual([{ itemIdentifier: "msg-1" }]); - expect(mockChangeVisibility).toHaveBeenCalledWith("receipt-1", 30); + const visibilityDelay = mockChangeVisibility.mock.calls[0]![1] as number; + expect(visibilityDelay).toBeGreaterThanOrEqual(30); + expect(visibilityDelay).toBeLessThanOrEqual(34); expect(mockSendToDlq).not.toHaveBeenCalled(); expect(mockDeliverPayload).not.toHaveBeenCalled(); }); @@ -350,17 +376,23 @@ describe("processRecords", () => { "target-1", 10, false, + 1, expect.any(Object), ); expect(mockDeliverPayload).toHaveBeenCalled(); }); - it("calls recordResult(true) on successful delivery when CB enabled", async () => { + it("calls recordResult with batch counts on successful delivery when CB enabled", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, }; mockLoadTargetConfig.mockResolvedValue(targetCb); + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); const failures = await processRecords([makeRecord()]); @@ -368,17 +400,23 @@ describe("processRecords", () => { expect(mockRecordResult).toHaveBeenCalledWith( expect.anything(), "target-1", - true, + 1, + 0, expect.any(Object), ); }); - it("calls recordResult(false) on 5xx before visibility change", async () => { + it("calls recordResult with failure count on 5xx when CB enabled", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, }; mockLoadTargetConfig.mockResolvedValue(targetCb); + mockAdmit.mockResolvedValue({ + allowed: true, + consumedTokens: 1, + effectiveRate: 10, + }); mockDeliverPayload.mockResolvedValue({ outcome: "transient_failure", statusCode: 503, @@ -390,13 +428,14 @@ describe("processRecords", () => { expect(mockRecordResult).toHaveBeenCalledWith( expect.anything(), "target-1", - false, + 1, + 1, expect.any(Object), ); 
expect(mockChangeVisibility).toHaveBeenCalled(); }); - it("does not call recordResult on rate-limited path", async () => { + it("does not call recordResult on gate admission-denied path", async () => { mockAdmit.mockResolvedValue({ allowed: false, reason: "rate_limited", @@ -409,17 +448,6 @@ describe("processRecords", () => { expect(mockRecordResult).not.toHaveBeenCalled(); }); - it("does not call recordResult on 429 path", async () => { - mockDeliverPayload.mockResolvedValue({ - outcome: "rate_limited", - retryAfterHeader: "60", - }); - - await processRecords([makeRecord()]); - - expect(mockRecordResult).not.toHaveBeenCalled(); - }); - it("does not call recordResult when CB is disabled on transient failure", async () => { const targetNoCb = { ...DEFAULT_TARGET, @@ -449,7 +477,7 @@ describe("processRecords", () => { expect(mockRecordResult).not.toHaveBeenCalled(); }); - it("emits CircuitBreakerOpen metric when recordResult returns opened", async () => { + it("records CircuitBreakerOpen when recordResult indicates circuit opened", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -459,18 +487,21 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: false, state: "opened" }); + mockRecordResult.mockResolvedValue({ + circuitState: "open", + stateChanged: true, + }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).toHaveBeenCalledWith("target-1"); + expect(recordCircuitBreakerOpen).toHaveBeenCalledWith("target-1"); }); - it("does not emit CircuitBreakerOpen when recordResult returns failed", async () => { + it("does not record CircuitBreakerOpen when recordResult has no state change", async () => { const targetCb = { ...DEFAULT_TARGET, 
delivery: { circuitBreaker: { enabled: true } }, @@ -480,18 +511,21 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: false, state: "failed" }); + mockRecordResult.mockResolvedValue({ + circuitState: "open", + stateChanged: false, + }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).not.toHaveBeenCalled(); + expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("does not emit CircuitBreakerOpen when recordResult returns closed", async () => { + it("does not record CircuitBreakerOpen when circuit is closed", async () => { const targetCb = { ...DEFAULT_TARGET, delivery: { circuitBreaker: { enabled: true } }, @@ -501,28 +535,61 @@ describe("processRecords", () => { outcome: "transient_failure", statusCode: 503, }); - mockRecordResult.mockResolvedValue({ ok: true, state: "closed" }); + mockRecordResult.mockResolvedValue({ + circuitState: "closed", + stateChanged: false, + }); - const { emitCircuitBreakerOpen } = jest.requireMock( - "services/delivery-metrics", + const { recordCircuitBreakerOpen } = jest.requireMock( + "services/delivery-observability", ); await processRecords([makeRecord()]); - expect(emitCircuitBreakerOpen).not.toHaveBeenCalled(); + expect(recordCircuitBreakerOpen).not.toHaveBeenCalled(); }); - it("emits RateLimited metric on 429 response", async () => { + it("records CircuitBreakerClosed when recordResult indicates circuit closed", async () => { + const targetCb = { + ...DEFAULT_TARGET, + delivery: { circuitBreaker: { enabled: true } }, + }; + mockLoadTargetConfig.mockResolvedValue(targetCb); + mockDeliverPayload.mockResolvedValue({ + outcome: "success", + statusCode: 200, + }); + mockRecordResult.mockResolvedValue({ + circuitState: "closed_recovery", + 
stateChanged: true, + }); + + const { recordCircuitBreakerClosed } = jest.requireMock( + "services/delivery-observability", + ); + + await processRecords([makeRecord()]); + + expect(recordCircuitBreakerClosed).toHaveBeenCalledWith("target-1"); + }); + + it("records RateLimited on 429 response", async () => { mockDeliverPayload.mockResolvedValue({ outcome: "rate_limited", retryAfterHeader: "60", }); - const { emitRateLimited } = jest.requireMock("services/delivery-metrics"); + const { recordDeliveryRateLimited } = jest.requireMock( + "services/delivery-observability", + ); await processRecords([makeRecord()]); - expect(emitRateLimited).toHaveBeenCalledWith("target-1"); + expect(recordDeliveryRateLimited).toHaveBeenCalledWith( + "client-1", + "target-1", + "test-message-id", + ); }); it("uses configured maxRetryDurationSeconds when set on target", async () => { @@ -558,4 +625,30 @@ describe("processRecords", () => { 7_200_000, ); }); + + it("groups records by target and processes each batch separately", async () => { + const record1 = makeRecord({ messageId: "msg-1" }); + const record2 = makeRecord({ + messageId: "msg-2", + body: JSON.stringify({ + payload: { + data: [ + { + type: "MessageStatus", + attributes: { messageStatus: "delivered" }, + }, + ], + }, + subscriptionId: "sub-2", + targetId: "target-2", + }), + }); + + const failures = await processRecords([record1, record2]); + + expect(failures).toEqual([]); + expect(mockAdmit).toHaveBeenCalledTimes(2); + expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-1"); + expect(mockLoadTargetConfig).toHaveBeenCalledWith("client-1", "target-2"); + }); }); diff --git a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts index 515f1377..00e04707 100644 --- a/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts +++ b/lambdas/https-client-lambda/src/__tests__/record-result-lua.test.ts @@ -1,31 +1,35 @@ 
import recordResultLuaSrc from "services/record-result.lua"; import { createRedisStore, evalLua } from "__tests__/helpers/lua-redis-mock"; -// ARGV: [now, success, cooldownMs, decayPeriodMs, cbErrorThreshold, cbMinAttempts, cbWindowPeriodMs] -// KEYS: [cbKey] -// Returns: [ok (0|1), state] state: "closed" | "opened" | "failed" +// ARGV: [now, consumedTokens, processingFailures, cooldownPeriodMs, recoveryPeriodMs, failureThreshold, minAttempts, samplePeriodMs] +// KEYS: [epKey] +// Returns: [circuitState, stateChanged] +// circuitState: "open" | "half_open" | "closed_recovery" | "closed" +// stateChanged: 0 | 1 type RecordResultArgs = { now: number; - success: boolean; - cooldownMs: number; - decayPeriodMs: number; - cbErrorThreshold: number; - cbMinAttempts: number; - cbWindowPeriodMs: number; + consumedTokens: number; + processingFailures: number; + cooldownPeriodMs: number; + recoveryPeriodMs: number; + failureThreshold: number; + minAttempts: number; + samplePeriodMs: number; }; const defaultArgs: RecordResultArgs = { now: 1_000_000, - success: true, - cooldownMs: 60_000, - decayPeriodMs: 300_000, - cbErrorThreshold: 0.5, - cbMinAttempts: 10, - cbWindowPeriodMs: 60_000, + consumedTokens: 1, + processingFailures: 0, + cooldownPeriodMs: 120_000, + recoveryPeriodMs: 600_000, + failureThreshold: 0.3, + minAttempts: 5, + samplePeriodMs: 300_000, }; -type RecordResultResult = [number, string]; +type RecordResultResult = [string, number]; function runRecordResult( store: ReturnType, @@ -35,15 +39,16 @@ function runRecordResult( const merged = { ...defaultArgs, ...args }; return evalLua( recordResultLuaSrc, - [`cb:${targetId}`], + [`ep:${targetId}`], [ merged.now.toString(), - merged.success ? 
"1" : "0", - merged.cooldownMs.toString(), - merged.decayPeriodMs.toString(), - merged.cbErrorThreshold.toString(), - merged.cbMinAttempts.toString(), - merged.cbWindowPeriodMs.toString(), + merged.consumedTokens.toString(), + merged.processingFailures.toString(), + merged.cooldownPeriodMs.toString(), + merged.recoveryPeriodMs.toString(), + merged.failureThreshold.toString(), + merged.minAttempts.toString(), + merged.samplePeriodMs.toString(), ], store, ) as RecordResultResult; @@ -51,323 +56,405 @@ function runRecordResult( describe("record-result.lua", () => { describe("success recording", () => { - it("returns closed state for a successful result", () => { + it("returns closed state for a successful batch with no state change", () => { const store = createRedisStore(); - const [ok, state] = runRecordResult(store, { success: true }); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); + + const [circuitState, stateChanged] = runRecordResult(store, { + consumedTokens: 5, + processingFailures: 0, + }); - expect(ok).toBe(1); - expect(state).toBe("closed"); + expect(circuitState).toBe("closed"); + expect(stateChanged).toBe(0); }); - it("increments attempt count without incrementing failures", () => { + it("increments cur_attempts without incrementing cur_failures", () => { const store = createRedisStore(); - runRecordResult(store, { success: true }); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); + + runRecordResult(store, { consumedTokens: 3, processingFailures: 0 }); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_attempts")).toBe("1"); - expect(cbHash.get("cb_failures")).toBe("0"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("3"); + expect(epHash.get("cur_failures")).toBe("0"); }); }); describe("failure recording", () => { - it("increments both attempts and failures on 
error", () => { + it("increments both cur_attempts and cur_failures", () => { const store = createRedisStore(); - runRecordResult(store, { success: false }); - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_attempts")).toBe("1"); - expect(cbHash.get("cb_failures")).toBe("1"); - }); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); - it("returns failed state for a single failure below threshold", () => { - const store = createRedisStore(); - const [ok, state] = runRecordResult(store, { success: false }); + runRecordResult(store, { consumedTokens: 5, processingFailures: 1 }); - expect(ok).toBe(0); - expect(state).toBe("failed"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("5"); + expect(epHash.get("cur_failures")).toBe("1"); }); - it("stays closed when below error threshold", () => { + it("returns closed state for failures below threshold", () => { const store = createRedisStore(); - const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); - for (let i = 0; i < 8; i++) { - runRecordResult(store, { now, success: true }); - } - for (let i = 0; i < 2; i++) { - runRecordResult(store, { now, success: false }); - } + const [circuitState, stateChanged] = runRecordResult(store, { + consumedTokens: 1, + processingFailures: 1, + }); - const [ok, state] = runRecordResult(store, { now, success: true }); - expect(ok).toBe(1); - expect(state).toBe("closed"); + expect(circuitState).toBe("closed"); + expect(stateChanged).toBe(0); }); }); - describe("circuit opening", () => { - it("opens circuit when error rate exceeds threshold", () => { + describe("recording guard — fully open", () => { + it("does not record attempts/failures when circuit is fully open", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 10_000; - for (let i = 0; 
i < 4; i++) { - const [, state] = runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - expect(state).toBe("failed"); - } - - const [ok, state] = runRecordResult(store, { + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], + ["cur_attempts", "0"], + ["cur_failures", "0"], + ]), + ); + + runRecordResult(store, { now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 5, + processingFailures: 3, }); - expect(ok).toBe(0); - expect(state).toBe("opened"); + + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_attempts")).toBe("0"); + expect(epHash.get("cur_failures")).toBe("0"); }); - it("does not open circuit when below minimum attempts", () => { + it("returns open when circuit is fully open and state unchanged", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 10_000; - for (let i = 0; i < 4; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 10, - }); - } + store.set( + "ep:t1", + new Map([ + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], + ]), + ); - const [ok, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { now, - success: false, - cbMinAttempts: 10, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 0, }); - expect(ok).toBe(0); - expect(state).toBe("failed"); + + expect(circuitState).toBe("open"); + expect(stateChanged).toBe(0); }); + }); - it("sets opened_until_ms with cooldown on open", () => { + describe("circuit opening", () => { + it("opens circuit when failure rate exceeds threshold", () => { const store = createRedisStore(); - const now = 1_000_000; - const cooldownMs = 30_000; - - for (let i = 0; i < 5; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, 
- cbErrorThreshold: 0.5, - cooldownMs, - }); - } - - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(now + cooldownMs); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); + + const [circuitState, stateChanged] = runRecordResult(store, { + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + }); + expect(circuitState).toBe("open"); + expect(stateChanged).toBe(1); }); - it("resets all counters on open", () => { + it("does not open circuit when below minimum attempts", () => { const store = createRedisStore(); - const now = 1_000_000; + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); - for (let i = 0; i < 5; i++) { - runRecordResult(store, { - now, - success: false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - } - - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("cb_failures")).toBe("0"); - expect(cbHash.get("cb_attempts")).toBe("0"); - expect(cbHash.get("cb_window_from")).toBe("0"); - expect(cbHash.get("cb_prev_failures")).toBe("0"); - expect(cbHash.get("cb_prev_attempts")).toBe("0"); + const [circuitState, stateChanged] = runRecordResult(store, { + consumedTokens: 3, + processingFailures: 3, + minAttempts: 5, + failureThreshold: 0.3, + }); + expect(circuitState).toBe("closed"); + expect(stateChanged).toBe(0); }); - it("does not double-trip when circuit is already open", () => { + it("sets is_open and switched_at on open", () => { const store = createRedisStore(); const now = 1_000_000; - const openedUntil = now + 60_000; - store.set( - "cb:t1", + "ep:t1", new Map([ - ["opened_until_ms", openedUntil.toString()], - ["cb_window_from", now.toString()], + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], ]), ); - for (let i = 0; i < 20; i++) { - const [, state] = runRecordResult(store, { - now, - success: 
false, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - expect(state).toBe("failed"); - } - - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil); + runRecordResult(store, { + now, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + }); + + const epHash = store.get("ep:t1")!; + expect(epHash.get("is_open")).toBe("1"); + expect(Number(epHash.get("switched_at"))).toBe(now); }); - }); - describe("two-window blended rate", () => { - it("blends previous window failures into current assessment", () => { + it("resets all counters and sets sampleTill on open", () => { const store = createRedisStore(); const now = 1_000_000; - const cbWindowPeriodMs = 60_000; - + const samplePeriodMs = 300_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", now.toString()], - ["cb_prev_failures", "8"], - ["cb_prev_attempts", "10"], + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], ]), ); - const [ok, state] = runRecordResult(store, { + runRecordResult(store, { now, - success: false, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + samplePeriodMs, }); - expect(ok).toBe(0); - expect(state).toBe("opened"); + + const epHash = store.get("ep:t1")!; + expect(epHash.get("cur_failures")).toBe("0"); + expect(epHash.get("cur_attempts")).toBe("0"); + expect(epHash.get("prev_failures")).toBe("0"); + expect(epHash.get("prev_attempts")).toBe("0"); + expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs); }); + }); - it("reduces previous window weight as current window ages", () => { + describe("circuit closing — half-open with successes", () => { + it("closes circuit when half-open and batch has successes", () => { const store = createRedisStore(); - const cbWindowPeriodMs = 100_000; - const t0 = 1_000_000; - const nearEnd = t0 + cbWindowPeriodMs - 1; 
+ const now = 1_000_000; + const switchedAt = now - 130_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", t0.toString()], - ["cb_prev_failures", "10"], - ["cb_prev_attempts", "10"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], ]), ); - for (let i = 0; i < 20; i++) { - runRecordResult(store, { - now: nearEnd, - success: true, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, - }); - } - - const [, state] = runRecordResult(store, { - now: nearEnd, - success: false, - cbWindowPeriodMs, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + const [circuitState, stateChanged] = runRecordResult(store, { + now, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 0, }); - expect(state).toBe("failed"); + + expect(circuitState).toBe("closed_recovery"); + expect(stateChanged).toBe(1); + + const epHash = store.get("ep:t1")!; + expect(epHash.get("is_open")).toBe("0"); + expect(Number(epHash.get("switched_at"))).toBe(now); }); - it("ignores previous window when cbWindowPeriodMs is 0", () => { + it("does not close when half-open but all attempts failed", () => { const store = createRedisStore(); const now = 1_000_000; + const switchedAt = now - 130_000; store.set( - "cb:t1", + "ep:t1", new Map([ - ["cb_window_from", now.toString()], - ["cb_prev_failures", "100"], - ["cb_prev_attempts", "100"], + ["is_open", "1"], + ["switched_at", switchedAt.toString()], + ["sample_till", "9999999999"], ]), ); - const [, state] = runRecordResult(store, { + const [circuitState, stateChanged] = runRecordResult(store, { now, - success: false, - cbWindowPeriodMs: 0, - cbMinAttempts: 5, - cbErrorThreshold: 0.5, + cooldownPeriodMs: 120_000, + consumedTokens: 1, + processingFailures: 1, }); - expect(state).toBe("failed"); + + expect(circuitState).toBe("half_open"); + expect(stateChanged).toBe(0); }); }); - describe("decay period", () => { - it("preserves opened_until_ms during active decay", () => { + 
describe("sliding window management", () => { + it("promotes current to previous when sampleTill expires", () => { const store = createRedisStore(); - const openedUntil = 1_060_000; - const duringDecay = openedUntil + 100_000; + const now = 1_000_000; + const samplePeriodMs = 300_000; + const sampleTill = now - 1; store.set( - "cb:t1", - new Map([["opened_until_ms", openedUntil.toString()]]), + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", sampleTill.toString()], + ["cur_attempts", "10"], + ["cur_failures", "3"], + ["prev_attempts", "0"], + ["prev_failures", "0"], + ]), ); - runRecordResult(store, { - now: duringDecay, - success: true, - decayPeriodMs: 300_000, - }); + runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 }); - const cbHash = store.get("cb:t1")!; - expect(Number(cbHash.get("opened_until_ms"))).toBe(openedUntil); + const epHash = store.get("ep:t1")!; + expect(epHash.get("prev_attempts")).toBe("10"); + expect(epHash.get("prev_failures")).toBe("3"); + expect(Number(epHash.get("sample_till"))).toBe( + sampleTill + samplePeriodMs, + ); }); - it("clears opened_until_ms after decay period elapses", () => { + it("complete reset when window is too old", () => { const store = createRedisStore(); - const openedUntil = 1_060_000; - const decayPeriodMs = 300_000; - const afterDecay = openedUntil + decayPeriodMs + 1; + const now = 1_000_000; + const samplePeriodMs = 300_000; + const sampleTill = now - samplePeriodMs - 1; store.set( - "cb:t1", - new Map([["opened_until_ms", openedUntil.toString()]]), + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", sampleTill.toString()], + ["cur_attempts", "10"], + ["cur_failures", "3"], + ["prev_attempts", "5"], + ["prev_failures", "2"], + ]), ); - runRecordResult(store, { - now: afterDecay, - success: true, - decayPeriodMs, - }); + runRecordResult(store, { now, samplePeriodMs, consumedTokens: 1 }); - const cbHash = store.get("cb:t1")!; - 
expect(cbHash.get("opened_until_ms")).toBe("0"); + const epHash = store.get("ep:t1")!; + expect(epHash.get("prev_attempts")).toBe("0"); + expect(epHash.get("prev_failures")).toBe("0"); + expect(Number(epHash.get("sample_till"))).toBe(now + samplePeriodMs); }); - it("clears opened_until_ms when circuit was never opened", () => { + it("interpolates using weight from sampleTill", () => { const store = createRedisStore(); + const samplePeriodMs = 300_000; const now = 1_000_000; + const sampleTill = now + samplePeriodMs; - runRecordResult(store, { now, success: true }); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", sampleTill.toString()], + ["prev_attempts", "10"], + ["prev_failures", "10"], + ]), + ); - const cbHash = store.get("cb:t1")!; - expect(cbHash.get("opened_until_ms")).toBe("0"); + // weight = (sampleTill - now) / samplePeriodMs = 1.0 + // interpolated attempts = 10 * 1.0 + 5 = 15 (>= minAttempts 5) + // interpolated failures = 10 * 1.0 + 5 = 15 + // failure rate = 15/15 = 1.0 > 0.3 → opens + const [circuitState, stateChanged] = runRecordResult(store, { + now, + samplePeriodMs, + consumedTokens: 5, + processingFailures: 5, + minAttempts: 5, + failureThreshold: 0.3, + }); + expect(circuitState).toBe("open"); + expect(stateChanged).toBe(1); }); }); describe("state persistence", () => { - it("writes all counter fields to redis", () => { + it("writes all sampling fields to redis", () => { const store = createRedisStore(); + store.set( + "ep:t1", + new Map([ + ["is_open", "0"], + ["switched_at", "0"], + ["sample_till", "9999999999"], + ]), + ); runRecordResult(store); - const cbHash = store.get("cb:t1")!; - expect(cbHash.has("opened_until_ms")).toBe(true); - expect(cbHash.has("cb_window_from")).toBe(true); - expect(cbHash.has("cb_failures")).toBe(true); - expect(cbHash.has("cb_attempts")).toBe(true); - expect(cbHash.has("cb_prev_failures")).toBe(true); - expect(cbHash.has("cb_prev_attempts")).toBe(true); + const 
epHash = store.get("ep:t1")!; + expect(epHash.has("cur_attempts")).toBe(true); + expect(epHash.has("cur_failures")).toBe(true); + expect(epHash.has("prev_attempts")).toBe(true); + expect(epHash.has("prev_failures")).toBe(true); + expect(epHash.has("sample_till")).toBe(true); }); }); }); diff --git a/lambdas/https-client-lambda/src/handler.ts b/lambdas/https-client-lambda/src/handler.ts index 28fcc6b9..b129fc8d 100644 --- a/lambdas/https-client-lambda/src/handler.ts +++ b/lambdas/https-client-lambda/src/handler.ts @@ -12,7 +12,6 @@ import { OUTCOME_SUCCESS, deliverPayload, } from "services/delivery/https-client"; -import type { DeliveryResult } from "services/delivery/https-client"; import { sendToDlq } from "services/dlq-sender"; import { changeVisibility } from "services/sqs-visibility"; import { @@ -26,7 +25,6 @@ import { recordResult, } from "services/endpoint-gate"; import { getRedisClient } from "services/redis-client"; -import { VisibilityManagedError } from "services/visibility-managed-error"; import { recordAdmissionDenied, recordCircuitBreakerClosed, @@ -47,13 +45,20 @@ const DEFAULT_MAX_RETRY_DURATION_MS = 7_200_000; // 2 hours const DEFAULT_CONCURRENCY_LIMIT = 5; const gateConfig: EndpointGateConfig = { - burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? "10"), - cbProbeIntervalMs: Number(process.env.CB_PROBE_INTERVAL_MS ?? "60000"), - decayPeriodMs: Number(process.env.CB_DECAY_PERIOD_MS ?? "300000"), - cbWindowPeriodMs: Number(process.env.CB_WINDOW_PERIOD_MS ?? "60000"), - cbErrorThreshold: Number(process.env.CB_ERROR_THRESHOLD ?? "0.5"), - cbMinAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "10"), - cbCooldownMs: Number(process.env.CB_COOLDOWN_MS ?? "60000"), + // Max tokens the bucket can hold — absorbs short traffic bursts without throttling (default: 2250) + burstCapacity: Number(process.env.TOKEN_BUCKET_BURST_CAPACITY ?? 
"2250"), + // Probe rate to test endpoint recovery when half-open (default: 1/60 req/s) + probeRateLimit: Number(process.env.CB_PROBE_RATE_LIMIT ?? String(1 / 60)), + // Linear ramp-up after circuit closes, avoids flooding a freshly recovered endpoint (default: 10 min) + recoveryPeriodMs: Number(process.env.CB_RECOVERY_PERIOD_MS ?? "600000"), + // Sliding window over which failure rates are sampled (default: 5 min) + samplePeriodMs: Number(process.env.CB_SAMPLE_PERIOD_MS ?? "300000"), + // Failure rate within the sample window that triggers circuit open (default: 30%) + failureThreshold: Number(process.env.CB_FAILURE_THRESHOLD ?? "0.3"), + // Minimum attempts in the sample window before the failure rate is evaluated (default: 5 attempts) + minAttempts: Number(process.env.CB_MIN_ATTEMPTS ?? "5"), + // Full block after circuit opens, before half-open probes begin (default: 2 min) + cooldownPeriodMs: Number(process.env.CB_COOLDOWN_PERIOD_MS ?? "120000"), }; type CallbackDeliveryMessage = { @@ -62,223 +67,257 @@ type CallbackDeliveryMessage = { targetId: string; }; -async function checkAdmission( - redis: RedisClientType, - targetId: string, - invocationRateLimit: number, - cbEnabled: boolean, - clientId: string, - record: SQSRecord, - correlationId?: string, -): Promise { - const gateResult = await admit( - redis, - targetId, - invocationRateLimit, - cbEnabled, - gateConfig, - ); +type TargetBatch = { + targetId: string; + records: SQSRecord[]; + messages: CallbackDeliveryMessage[]; +}; - if (!gateResult.allowed) { - const delaySec = Math.ceil(gateResult.retryAfterMs / 1000); - recordAdmissionDenied(clientId, targetId, gateResult.reason, correlationId); - await changeVisibility(record.receiptHandle, delaySec); - throw new VisibilityManagedError(`Admission denied: ${gateResult.reason}`); +function groupByTarget(records: SQSRecord[]): TargetBatch[] { + const groups = new Map< + string, + { records: SQSRecord[]; messages: CallbackDeliveryMessage[] } + >(); + + for (const 
record of records) { + const message: CallbackDeliveryMessage = JSON.parse(record.body); + const existing = groups.get(message.targetId); + if (existing) { + existing.records.push(record); + existing.messages.push(message); + } else { + groups.set(message.targetId, { records: [record], messages: [message] }); + } } -} -const OUTCOME_DELIVERED = "delivered" as const; -const OUTCOME_DLQ = "dlq" as const; -type RecordOutcome = typeof OUTCOME_DELIVERED | typeof OUTCOME_DLQ; + return [...groups.entries()].map( + ([targetId, { messages, records: recs }]) => ({ + targetId, + records: recs, + messages, + }), + ); +} -async function handleDeliveryResult( - result: DeliveryResult, +async function deliverRecord( record: SQSRecord, - redis: RedisClientType, + message: CallbackDeliveryMessage, + target: Awaited>, + applicationId: string, clientId: string, - targetId: string, - cbEnabled: boolean, - correlationId?: string, -): Promise { +): Promise<{ success: boolean }> { + const correlationId = message.payload.data[0]?.attributes?.messageId; + + const maxRetryDurationMs = + target.delivery?.maxRetryDurationSeconds === undefined + ? 
DEFAULT_MAX_RETRY_DURATION_MS + : target.delivery.maxRetryDurationSeconds * 1000; + + const firstReceivedMs = Number( + record.attributes.ApproximateFirstReceiveTimestamp, + ); + + if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) { + recordRetryWindowExhausted(clientId, message.targetId, correlationId); + await sendToDlq(record.body); + return { success: true }; + } + + const agent = await buildAgent(target); + const signature = signPayload( + applicationId, + target.apiKey.headerValue, + message.payload, + ); + const payloadJson = JSON.stringify(message.payload); + + recordDeliveryAttempt(clientId, message.targetId, correlationId); + const deliveryStart = Date.now(); + const result = await deliverPayload(target, payloadJson, signature, agent); + recordDeliveryDuration(message.targetId, Date.now() - deliveryStart); + if (result.outcome === OUTCOME_SUCCESS) { - if (cbEnabled) { - const cbOutcome = await recordResult(redis, targetId, true, gateConfig); - if (cbOutcome.ok && cbOutcome.state === "closed") { - recordCircuitBreakerClosed(targetId, correlationId); - } - } - recordDeliverySuccess(clientId, targetId, correlationId); - return OUTCOME_DELIVERED; + recordDeliverySuccess(clientId, message.targetId, correlationId); + return { success: true }; } if (result.outcome === OUTCOME_PERMANENT_FAILURE) { recordDeliveryPermanentFailure( clientId, - targetId, + message.targetId, result.statusCode, result.errorCode, correlationId, ); await sendToDlq(record.body, result); - return OUTCOME_DLQ; + return { success: true }; } if (result.outcome === OUTCOME_RATE_LIMITED) { const receiveCount = Number(record.attributes.ApproximateReceiveCount); - recordDeliveryRateLimited(clientId, targetId, correlationId); + recordDeliveryRateLimited(clientId, message.targetId, correlationId); await handleRateLimitedRecord( record, clientId, - targetId, + message.targetId, result.retryAfterHeader, receiveCount, ); - return OUTCOME_DELIVERED; // unreachable — handleRateLimitedRecord 
always throws + return { success: true }; } const receiveCount = Number(record.attributes.ApproximateReceiveCount); const backoffSec = jitteredBackoffSeconds(receiveCount); - if (cbEnabled) { - const cbOutcome = await recordResult(redis, targetId, false, gateConfig); - if (cbOutcome.state === "opened") { - recordCircuitBreakerOpen(targetId, correlationId); - } - } recordDeliveryFailure( clientId, - targetId, + message.targetId, result.statusCode, backoffSec, receiveCount, correlationId, ); await changeVisibility(record.receiptHandle, backoffSec); - throw new VisibilityManagedError(`Transient failure: ${result.statusCode}`); + return { success: false }; } -async function processRecord( - record: SQSRecord, +async function processTargetBatch( + batch: TargetBatch, redis: RedisClientType, -): Promise { - const { CLIENT_ID } = process.env; - if (!CLIENT_ID) { - throw new Error("CLIENT_ID is required"); + clientId: string, + concurrencyLimit: number, +): Promise { + const target = await loadTargetConfig(clientId, batch.targetId); + const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? 
false; + + const gateResult = await admit( + redis, + batch.targetId, + target.invocationRateLimit, + cbEnabled, + batch.records.length, + gateConfig, + ); + + if (!gateResult.allowed) { + const baseDelaySec = Math.ceil(gateResult.retryAfterMs / 1000); + recordAdmissionDenied(clientId, batch.targetId, gateResult.reason); + const failures: SQSBatchItemFailure[] = []; + for (const record of batch.records) { + // eslint-disable-next-line sonarjs/pseudo-random -- jitter for backoff, not security-sensitive + const jitterSec = Math.floor(Math.random() * 5); + await changeVisibility(record.receiptHandle, baseDelaySec + jitterSec); + failures.push({ itemIdentifier: record.messageId }); + } + return failures; } - const message: CallbackDeliveryMessage = JSON.parse(record.body); - const { payload, targetId } = message; - const messageId = payload.data[0]?.attributes?.messageId; + const { consumedTokens } = gateResult; + const admitted = batch.records.slice(0, consumedTokens); + const rejected = batch.records.slice(consumedTokens); + const admittedMessages = batch.messages.slice(0, consumedTokens); - logger.info("Processing delivery", { - clientId: CLIENT_ID, - targetId, - messageId, - sqsMessageId: record.messageId, - receiveCount: record.attributes.ApproximateReceiveCount, - }); + const applicationId = await getApplicationId(clientId); - const target = await loadTargetConfig(CLIENT_ID, targetId); - const maxRetryDurationMs = - target.delivery?.maxRetryDurationSeconds === undefined - ? 
DEFAULT_MAX_RETRY_DURATION_MS - : target.delivery.maxRetryDurationSeconds * 1000; + const failures: SQSBatchItemFailure[] = []; + let processingFailures = 0; - const firstReceivedMs = Number( - record.attributes.ApproximateFirstReceiveTimestamp, + const deliveryResults = await pMap( + admitted, + async (record, index): Promise<{ record: SQSRecord; success: boolean }> => { + try { + const outcome = await deliverRecord( + record, + admittedMessages[index], + target, + applicationId, + clientId, + ); + return { record, success: outcome.success }; + } catch (error) { + logger.error("Failed to process record", { + messageId: record.messageId, + err: error, + }); + const receiveCount = Number(record.attributes.ApproximateReceiveCount); + await changeVisibility( + record.receiptHandle, + jitteredBackoffSeconds(receiveCount), + ); + return { record, success: false }; + } + }, + { concurrency: concurrencyLimit }, ); - if (isWindowExhausted(firstReceivedMs, maxRetryDurationMs)) { - recordRetryWindowExhausted(CLIENT_ID, targetId, messageId); - await sendToDlq(record.body); - return OUTCOME_DLQ; + for (const { record, success } of deliveryResults) { + if (!success) { + processingFailures += 1; + failures.push({ itemIdentifier: record.messageId }); + } } - const applicationId = await getApplicationId(CLIENT_ID); - const cbEnabled = target.delivery?.circuitBreaker?.enabled ?? 
false; - - await checkAdmission( - redis, - targetId, - target.invocationRateLimit, - cbEnabled, - CLIENT_ID, - record, - messageId, - ); - - const agent = await buildAgent(target); - const signature = signPayload( - applicationId, - target.apiKey.headerValue, - payload, - ); - const payloadJson = JSON.stringify(payload); + if (cbEnabled && consumedTokens > 0) { + const cbOutcome = await recordResult( + redis, + batch.targetId, + consumedTokens, + processingFailures, + gateConfig, + ); + if (cbOutcome.stateChanged && cbOutcome.circuitState === "open") { + recordCircuitBreakerOpen(batch.targetId); + } + if ( + cbOutcome.stateChanged && + cbOutcome.circuitState === "closed_recovery" + ) { + recordCircuitBreakerClosed(batch.targetId); + } + } - recordDeliveryAttempt(CLIENT_ID, targetId, messageId); - const deliveryStart = Date.now(); - const result = await deliverPayload(target, payloadJson, signature, agent); - recordDeliveryDuration(targetId, Date.now() - deliveryStart); + for (const record of rejected) { + failures.push({ itemIdentifier: record.messageId }); + } - return handleDeliveryResult( - result, - record, - redis, - CLIENT_ID, - targetId, - cbEnabled, - messageId, - ); + return failures; } export async function processRecords( records: SQSRecord[], ): Promise { - resetMetrics(); + const { CLIENT_ID } = process.env; + if (!CLIENT_ID) { + throw new Error("CLIENT_ID is required"); + } - logger.info("Batch received", { batchSize: records.length }); + resetMetrics(); const concurrencyLimit = Number( process.env.CONCURRENCY_LIMIT ?? 
String(DEFAULT_CONCURRENCY_LIMIT), ); + logger.info("Batch received", { batchSize: records.length }); + const redis = await getRedisClient(); + const targetBatches = groupByTarget(records); - const results = await pMap( - records, - async (record): Promise => { - try { - return await processRecord(record, redis); - } catch (error) { - if (!(error instanceof VisibilityManagedError)) { - logger.error("Failed to process record", { - messageId: record.messageId, - err: error, - }); - const receiveCount = Number( - record.attributes.ApproximateReceiveCount, - ); - await changeVisibility( - record.receiptHandle, - jitteredBackoffSeconds(receiveCount), - ); - } - return { itemIdentifier: record.messageId }; - } - }, - { concurrency: concurrencyLimit }, - ); + const allFailures: SQSBatchItemFailure[] = []; + + for (const batch of targetBatches) { + const batchFailures = await processTargetBatch( + batch, + redis, + CLIENT_ID, + concurrencyLimit, + ); + allFailures.push(...batchFailures); + } - await flushMetrics(); - const failures = results.filter( - (r): r is SQSBatchItemFailure => typeof r === "object", - ); - const deliveredCount = results.filter((r) => r === OUTCOME_DELIVERED).length; - const dlqCount = results.filter((r) => r === OUTCOME_DLQ).length; logger.info("Batch complete", { batchSize: records.length, - deliveredCount, - dlqCount, - failureCount: failures.length, + failureCount: allFailures.length, }); - return failures; + + await flushMetrics(); + return allFailures; } diff --git a/lambdas/https-client-lambda/src/services/admit.lua b/lambdas/https-client-lambda/src/services/admit.lua index fd56decb..53b6977c 100644 --- a/lambdas/https-client-lambda/src/services/admit.lua +++ b/lambdas/https-client-lambda/src/services/admit.lua @@ -1,203 +1,108 @@ --- admit.lua — Decides whether a request to an endpoint is allowed. +-- admit.lua — Pre-processing: determines rate limit and consumes tokens. -- --- Three sequential checks run atomically: --- 1. 
Circuit breaker — is the endpoint currently healthy? --- 2. Sliding window — roll the two-window error-rate accounting state if needed --- 3. Token bucket — is the endpoint within its rate limit? +-- Two sequential steps run atomically: +-- 1. Circuit breaker — determine effective rate from circuit state +-- 2. Token bucket — consume tokens for the target batch -- --- A request is allowed only when all three checks pass. +-- The circuit has four states: +-- Open (during cooldown): rate = 0, complete block, bucket untouched +-- Half-open (after cooldown): rate = probeRateLimit +-- Recovering (closed, during recovery period): linear ramp-up +-- Normal (closed): full configured rate -- --- While the circuit is open, a timed probe is let through at most once per --- cbProbeIntervalMs so the caller can test whether the endpoint has recovered. --- The probe bypasses the rate limit — counting it here would skew a --- low-volume probe signal against the recovery decision. --- --- After the circuit closes, the token fill rate ramps up linearly from --- near-zero to full over decayPeriodMs to avoid a thundering herd on recovery. 
--- --- Returns: { allowed (0|1), reason, retryAfterMs, effectiveRate } +-- Returns: { consumedTokens, reason, retryAfterMs, effectiveRate } -- Keys -local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash -local rlKey = KEYS[2] -- rl:{endpoint} rate limiter state hash +local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash -- Arguments -local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms) -local capacity = tonumber(ARGV[2]) or 0 -- token bucket maximum capacity -local refillPerSec = tonumber(ARGV[3]) or 0 -- full token fill rate (tokens/sec) -local cooldownMs = tonumber(ARGV[4]) or 0 -- how long the circuit stays open (ms) -local decayPeriodMs = tonumber(ARGV[5]) or 0 -- ramp-up window after circuit closes (ms) -local cbWindowPeriodMs = tonumber(ARGV[6]) or 0 -- error-rate sliding window duration (ms) -local cbProbeIntervalMs = tonumber(ARGV[7]) or 0 -- minimum gap between probe requests (ms; 0 = no probes) - --- TTL policy: circuit breaker state must outlive the cooldown window so that --- the ramp-up period remains visible to subsequent calls after a close. --- Rate limiter state needs only a short idle window. 
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60 -local rlTtlSeconds = 120 +local now = tonumber(ARGV[1]) or 0 +local capacity = tonumber(ARGV[2]) or 0 +local targetRateLimit = tonumber(ARGV[3]) or 0 +local cooldownMs = tonumber(ARGV[4]) or 0 +local recoveryPeriodMs = tonumber(ARGV[5]) or 0 +local probeRateLimit = tonumber(ARGV[6]) or 0 +local targetBatchSize = tonumber(ARGV[7]) or 0 -------------------------------------------------------------------------------- -- LOAD STATE -------------------------------------------------------------------------------- -local cb = redis.call("HMGET", cbKey, - "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts", "last_probe_ms", - "cb_prev_failures", "cb_prev_attempts") -local openedUntil = tonumber(cb[1] or "0") -local cbWindowFrom = tonumber(cb[2] or "0") -local cbFailures = tonumber(cb[3] or "0") -local cbAttempts = tonumber(cb[4] or "0") -local lastProbeMs = tonumber(cb[5] or "0") -local cbPrevFailures = tonumber(cb[6] or "0") -local cbPrevAttempts = tonumber(cb[7] or "0") - -local rl = redis.call("HMGET", rlKey, "tokens", "last_refill_ms") -local tokens = tonumber(rl[1] or capacity) -local lastRefill = tonumber(rl[2] or now) +local state = redis.call("HMGET", epKey, + "is_open", "switched_at", "bucket_tokens", "bucket_refilled_at") +local isOpenRaw = state[1] +local needInit = isOpenRaw == false or isOpenRaw == nil +local isOpen = needInit or tonumber(isOpenRaw) == 1 +local switchedAt = needInit and 0 or tonumber(state[2] or "0") +local bucketTokens = tonumber(state[3] or "0") +local bucketRefilledAt = needInit and now or tonumber(state[4] or "0") -------------------------------------------------------------------------------- --- 1. CIRCUIT BREAKER --- --- The circuit is open when openedUntil is set and has not yet elapsed. --- All requests are rejected while open to give the endpoint time to recover. 
--- --- Timed probes: once per cbProbeIntervalMs a single request is allowed --- through even while the circuit is open. The caller must record the --- outcome via record-result.lua; a successful probe will close the circuit --- and trigger the ramp-up phase. +-- 1. CIRCUIT BREAKER — determine effective rate -------------------------------------------------------------------------------- -if openedUntil > 0 and now < openedUntil then - -- Allow a probe through if the probe interval has elapsed - if cbProbeIntervalMs > 0 and (now - lastProbeMs) >= cbProbeIntervalMs then - lastProbeMs = now - redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "last_probe_ms", lastProbeMs, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - redis.call("EXPIRE", cbKey, cbTtlSeconds) - return { 1, "probe", 0, 0 } - end - - -- Circuit is open and no probe slot is available — reject - return { 0, "circuit_open", openedUntil - now, 0 } -end +local isHalfOpen = isOpen and now > switchedAt + cooldownMs +local isRecovering = (not isOpen) and now < switchedAt + recoveryPeriodMs --------------------------------------------------------------------------------- --- 2. SLIDING WINDOW --- --- Two windows (current + previous) together approximate a sliding window over --- cbWindowPeriodMs. When the current window expires it is promoted to previous --- and a fresh current window starts. record-result.lua blends the two windows --- using a time-based weight to smooth the error rate across the boundary rather --- than resetting it to zero at expiry. --- --- record-result.lua is responsible for incrementing the counters; this script --- is only responsible for rolling the window boundary forward when it expires. 
--------------------------------------------------------------------------------- +local effectiveRate -if cbWindowFrom == 0 then - -- No window exists yet — start one now - cbWindowFrom = now -elseif (now - cbWindowFrom) > cbWindowPeriodMs then - -- Current window has expired — roll it forward - if (now - cbWindowFrom) > (2 * cbWindowPeriodMs) then - -- Both current and previous windows are stale: a long quiet period means - -- old failure counts are no longer relevant to the health of the endpoint. - cbPrevFailures = 0 - cbPrevAttempts = 0 +if isOpen then + if isHalfOpen then + effectiveRate = probeRateLimit + else + return { 0, "circuit_open", (switchedAt + cooldownMs) - now, 0 } + end +else + if isRecovering then + local rampRange = math.max(0, targetRateLimit - probeRateLimit) + local rampProgress = math.max(0, now - switchedAt) / recoveryPeriodMs + effectiveRate = probeRateLimit + rampProgress * rampRange else - -- Promote current → previous so it can be blended with the new current window - cbPrevFailures = cbFailures - cbPrevAttempts = cbAttempts + effectiveRate = targetRateLimit end - cbFailures = 0 - cbAttempts = 0 - cbWindowFrom = now end -------------------------------------------------------------------------------- --- 3. TOKEN BUCKET +-- 2. TOKEN BUCKET — batch consumption -- --- Refills tokens based on elapsed time, then tries to consume one. --- If no tokens are available the request is rate-limited. +-- Generate tokens based on elapsed time, then consume as many as needed for +-- the batch, up to the number available. -- --- Ramp-up: after the circuit closes (openedUntil is set but in the past), --- effectiveRate scales linearly from near-zero to the full refillPerSec over --- decayPeriodMs. This deliberately slows recovery traffic so a flapping --- endpoint is not immediately overwhelmed. --- Once decayPeriodMs elapses, openedUntil is cleared and the full rate resumes. 
+-- bucketRefilledAt tracks the point in time up to which tokens have been +-- generated. We advance it by exactly the time needed to produce the whole +-- tokens we generated (generationTime), rather than setting it to `now`. +-- +-- Why not `now`? Token generation uses floor(), so any sub-token fractional +-- time is truncated. Setting bucketRefilledAt = now would discard that +-- remainder, meaning the next call starts its elapsed-time calculation from +-- a later point than it should. Over many calls this causes token leakage — +-- the bucket refills slower than the configured rate. By advancing only by +-- generationTime, the leftover fractional time carries over to the next call. -------------------------------------------------------------------------------- -local effectiveRate = refillPerSec - -if openedUntil > 0 and now > openedUntil and decayPeriodMs > 0 then - -- Circuit has recently closed — apply linear ramp-up - local sinceClose = now - openedUntil - if sinceClose >= decayPeriodMs then - -- Decay period fully elapsed — restore full rate and clear the CB timestamp - openedUntil = 0 - else - -- Still within decay period — scale fill rate proportionally to time elapsed - local fraction = sinceClose / decayPeriodMs - effectiveRate = math.max(1, math.floor(refillPerSec * fraction)) - end +if isOpen then + bucketTokens = 0 end --- Refill tokens based on time elapsed since last refill -local elapsed = now - lastRefill -if elapsed > 0 then - local refill = math.floor((elapsed * effectiveRate) / 1000) - if refill > 0 then - tokens = math.min(capacity, tokens + refill) - lastRefill = now - end -end +local generatedTokens = math.floor((now - bucketRefilledAt) * effectiveRate / 1000) +local availTokens = math.min(capacity, bucketTokens + generatedTokens) +local consumedTokens = math.min(targetBatchSize, availTokens) --- Not enough tokens — rate-limited --- TTL is intentionally not refreshed here; it was set on the last allowed call. 
-if tokens < 1 then - redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - redis.call("HSET", rlKey, - "tokens", tokens, - "last_refill_ms", lastRefill - ) - return { 0, "rate_limited", 1000, effectiveRate } +bucketTokens = availTokens - consumedTokens +if generatedTokens > 0 and effectiveRate > 0 then + local generationTime = generatedTokens * 1000 / effectiveRate + bucketRefilledAt = bucketRefilledAt + generationTime end --- Consume one token -tokens = tokens - 1 - -------------------------------------------------------------------------------- --- 4. PERSIST STATE AND ALLOW +-- 3. PERSIST STATE AND RETURN -------------------------------------------------------------------------------- -redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts -) -redis.call("HSET", rlKey, - "tokens", tokens, - "last_refill_ms", lastRefill +redis.call("HSET", epKey, + "bucket_tokens", bucketTokens, + "bucket_refilled_at", bucketRefilledAt ) -redis.call("EXPIRE", cbKey, cbTtlSeconds) -redis.call("EXPIRE", rlKey, rlTtlSeconds) - -return { 1, "allowed", 0, effectiveRate } +local reason = consumedTokens < 1 and "rate_limited" or "allowed" +local retryAfter = consumedTokens < 1 and 1000 or 0 +return { consumedTokens, reason, retryAfter, effectiveRate } diff --git a/lambdas/https-client-lambda/src/services/endpoint-gate.ts b/lambdas/https-client-lambda/src/services/endpoint-gate.ts index c2d85439..8870e5d8 100644 --- a/lambdas/https-client-lambda/src/services/endpoint-gate.ts +++ b/lambdas/https-client-lambda/src/services/endpoint-gate.ts @@ -5,7 +5,7 @@ import recordResultLuaSrc from "services/record-result.lua"; export type 
AdmitResultAllowed = { allowed: true; - probe: boolean; + consumedTokens: number; effectiveRate: number; }; @@ -18,18 +18,21 @@ export type AdmitResultDenied = { export type AdmitResult = AdmitResultAllowed | AdmitResultDenied; -export type RecordResultOutcome = - | { ok: true; state: "closed" } - | { ok: false; state: "opened" | "failed" }; +export type CircuitState = "open" | "half_open" | "closed_recovery" | "closed"; + +export type RecordResultOutcome = { + circuitState: CircuitState; + stateChanged: boolean; +}; export type EndpointGateConfig = { burstCapacity: number; - cbProbeIntervalMs: number; - decayPeriodMs: number; - cbWindowPeriodMs: number; - cbErrorThreshold: number; - cbMinAttempts: number; - cbCooldownMs: number; + probeRateLimit: number; + recoveryPeriodMs: number; + samplePeriodMs: number; + failureThreshold: number; + minAttempts: number; + cooldownPeriodMs: number; }; let admitSha: string | undefined; @@ -76,22 +79,21 @@ export async function admit( targetId: string, refillPerSec: number, cbEnabled: boolean, + targetBatchSize: number, config: EndpointGateConfig, ): Promise { - const cbKey = `cb:{${targetId}}`; - const rlKey = `rl:{${targetId}}`; + const epKey = `ep:{${targetId}}`; const now = Date.now().toString(); - const probeIntervalMs = cbEnabled ? config.cbProbeIntervalMs.toString() : "0"; + const probeRate = cbEnabled ? 
config.probeRateLimit.toString() : "0"; const args = [ now, config.burstCapacity.toString(), - // eslint-disable-next-line sonarjs/null-dereference - refillPerSec.toString(), - config.cbCooldownMs.toString(), - config.decayPeriodMs.toString(), - config.cbWindowPeriodMs.toString(), - probeIntervalMs, + String(refillPerSec), + config.cooldownPeriodMs.toString(), + config.recoveryPeriodMs.toString(), + probeRate, + String(targetBatchSize), ]; if (!admitSha) { @@ -102,16 +104,16 @@ export async function admit( client, admitLuaSrc, admitSha, - [cbKey, rlKey], + [epKey], args, )) as [number, string, number, number]; - const [allowed, reason, retryAfterMs, effectiveRate] = raw; + const [consumedOrFlag, reason, retryAfterMs, effectiveRate] = raw; - if (allowed === 1) { + if (reason === "allowed" || reason === "probe") { return { allowed: true, - probe: reason === "probe", + consumedTokens: Number(consumedOrFlag), effectiveRate: Number(effectiveRate), }; } @@ -127,20 +129,22 @@ export async function admit( export async function recordResult( client: RedisClientType, targetId: string, - success: boolean, + consumedTokens: number, + processingFailures: number, config: EndpointGateConfig, ): Promise { - const cbKey = `cb:{${targetId}}`; + const epKey = `ep:{${targetId}}`; const now = Date.now().toString(); const args = [ now, - success ? 
"1" : "0", - config.cbCooldownMs.toString(), - config.decayPeriodMs.toString(), - config.cbErrorThreshold.toString(), - config.cbMinAttempts.toString(), - config.cbWindowPeriodMs.toString(), + String(consumedTokens), + String(processingFailures), + config.cooldownPeriodMs.toString(), + config.recoveryPeriodMs.toString(), + config.failureThreshold.toString(), + config.minAttempts.toString(), + config.samplePeriodMs.toString(), ]; if (!recordResultSha) { @@ -151,17 +155,16 @@ export async function recordResult( client, recordResultLuaSrc, recordResultSha, - [cbKey], + [epKey], args, - )) as [number, string]; - - const [ok, state] = raw; + )) as [string, number]; - if (ok === 1) { - return { ok: true, state: "closed" }; - } + const [circuitState, stateChanged] = raw; - return { ok: false, state: state as "opened" | "failed" }; + return { + circuitState: circuitState as CircuitState, + stateChanged: stateChanged === 1, + }; } export function resetAdmitSha(): void { diff --git a/lambdas/https-client-lambda/src/services/record-result.lua b/lambdas/https-client-lambda/src/services/record-result.lua index 1cc94857..fa42efea 100644 --- a/lambdas/https-client-lambda/src/services/record-result.lua +++ b/lambdas/https-client-lambda/src/services/record-result.lua @@ -1,144 +1,169 @@ --- record-result.lua — Records the outcome of a delivery attempt. +-- record-result.lua — Post-processing: updates sampling and circuit breaker. -- --- Updates the circuit breaker's error-rate window counters and opens the --- circuit if the failure rate exceeds the configured threshold. +-- After processing a batch, this script: +-- 1. Manages the sliding window (rolling forward as necessary) +-- 2. Records new attempts and failures (unless fully open) +-- 3. Interpolates attempt/failure rates using the sliding window +-- 4. Checks whether to close the circuit (half-open + successes) +-- 5. 
Checks whether to open the circuit (closed + threshold exceeded) -- --- On success: --- Window counters are left intact. The openedUntil timestamp is preserved --- while the decay period is still active so that admit.lua can continue --- computing the linear ramp-up rate. Once the decay period elapses it --- is zeroed, returning the circuit to a fully clean closed state. +-- Returns: { circuitState, stateChanged } -- --- On failure: --- The failure and attempt counters are incremented. A two-window sliding --- blend is computed before evaluating the trip condition: --- slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight --- slidingFailures = cbFailures + cbPrevFailures * prevWeight --- where prevWeight decays linearly from 1.0 → 0.0 as the current window ages, --- so previous-window failures fade out gradually rather than dropping off a cliff. --- The circuit opens when: --- • the endpoint is not already open (prevents double-tripping and --- resetting the cooldown timer prematurely), AND --- • slidingAttempts >= cbMinAttempts (avoids tripping on statistically --- insignificant data at cold start or just after a window roll), AND --- • slidingFailures / slidingAttempts exceeds cbErrorThreshold. --- On open, all counters (current and previous) are reset to zero so the --- fresh cooldown window begins with a clean slate ready for recovery. 
+-- circuitState: the current state of the circuit after this run +-- "open" — fully open (during cooldown, no probes) +-- "half_open" — open but past cooldown (probing) +-- "closed_recovery" — closed but ramping up (recovery period) +-- "closed" — closed, running at full rate -- --- Returns: { ok (0|1), state } --- state: "closed" | "opened" | "failed" +-- stateChanged: whether a circuit transition occurred this run +-- 1 — the circuit opened or closed during this execution +-- 0 — no state transition + +-- Circuit state constants +local OPEN = "open" +local HALF_OPEN = "half_open" +local CLOSED_RECOVERY = "closed_recovery" +local CLOSED = "closed" -- Keys -local cbKey = KEYS[1] -- cb:{endpoint} circuit breaker state hash +local epKey = KEYS[1] -- ep:{targetId} combined endpoint state hash -- Arguments -local now = tonumber(ARGV[1]) or 0 -- current wall-clock time (ms) -local success = tonumber(ARGV[2]) or 0 -- 1 = success, 0 = failure -local cooldownMs = tonumber(ARGV[3]) or 0 -- how long the circuit stays open (ms) -local decayPeriodMs = tonumber(ARGV[4]) or 0 -- ramp-up window after circuit closes (ms) -local cbErrorThreshold = tonumber(ARGV[5]) or 0 -- error-rate fraction that trips the circuit (e.g. 0.5) -local cbMinAttempts = tonumber(ARGV[6]) or 0 -- minimum samples before the circuit can trip -local cbWindowPeriodMs = tonumber(ARGV[7]) or 0 -- error-rate sliding window duration (ms) - --- TTL policy: keep circuit breaker state alive for at least the cooldown --- duration plus a buffer so the decay period remains visible after a close. 
-local cbTtlSeconds = math.ceil(cooldownMs / 1000) + 60 - -local function refreshCbExpiry() - redis.call("EXPIRE", cbKey, cbTtlSeconds) -end +local now = tonumber(ARGV[1]) or 0 +local consumedTokens = tonumber(ARGV[2]) or 0 +local processingFailures = tonumber(ARGV[3]) or 0 +local cooldownPeriodMs = tonumber(ARGV[4]) or 0 +local recoveryPeriodMs = tonumber(ARGV[5]) or 0 +local failureThreshold = tonumber(ARGV[6]) or 0 +local minAttempts = tonumber(ARGV[7]) or 0 +local samplePeriodMs = tonumber(ARGV[8]) or 0 -------------------------------------------------------------------------------- -- LOAD CURRENT STATE -------------------------------------------------------------------------------- -local cb = redis.call("HMGET", cbKey, - "opened_until_ms", "cb_window_from", "cb_failures", "cb_attempts", - "cb_prev_failures", "cb_prev_attempts") -local openedUntil = tonumber(cb[1] or "0") -local cbWindowFrom = tonumber(cb[2] or "0") -local cbFailures = tonumber(cb[3] or "0") -local cbAttempts = tonumber(cb[4] or "0") -local cbPrevFailures = tonumber(cb[5] or "0") -local cbPrevAttempts = tonumber(cb[6] or "0") +local state = redis.call("HMGET", epKey, + "is_open", "switched_at", + "cur_attempts", "prev_attempts", "cur_failures", "prev_failures", + "sample_till") +local isOpenRaw = state[1] +local needInit = isOpenRaw == false or isOpenRaw == nil +local isOpen = needInit or tonumber(isOpenRaw) == 1 +local switchedAt = needInit and 0 or tonumber(state[2] or "0") +local curAttempts = tonumber(state[3] or "0") +local prevAttempts = tonumber(state[4] or "0") +local curFailures = tonumber(state[5] or "0") +local prevFailures = tonumber(state[6] or "0") +local sampleTill = tonumber(state[7] or "0") + +-------------------------------------------------------------------------------- +-- 1. 
DETERMINE CIRCUIT SUB-STATE +-------------------------------------------------------------------------------- --- Every outcome (success or failure) contributes to the error-rate window -cbAttempts = cbAttempts + 1 +local isHalfOpen = isOpen and now > switchedAt + cooldownPeriodMs +local isFullyOpen = isOpen and not isHalfOpen -------------------------------------------------------------------------------- --- SUCCESS — preserve openedUntil during decay, then zero it --- --- admit.lua uses openedUntil to calculate the linear ramp-up rate while the --- decay period is active. That timestamp must survive in Redis until the --- decay period ends. Clearing it prematurely would snap the fill rate back --- to full immediately rather than ramping gradually. --------------------------------------------------------------------------------- - -if success == 1 then - -- Keep openedUntil only if we are still within the decay window - local inDecayWindow = openedUntil > 0 and now > openedUntil and (now - openedUntil) < decayPeriodMs - local preservedOpenedUntil = inDecayWindow and openedUntil or 0 - - redis.call("HSET", cbKey, - "opened_until_ms", preservedOpenedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts - ) - refreshCbExpiry() - return { 1, "closed" } +-- 2. 
MANAGE SLIDING WINDOW +-------------------------------------------------------------------------------- + +if sampleTill < now then + if sampleTill + samplePeriodMs < now then + -- Complete reset — window is too old + prevAttempts = 0 + prevFailures = 0 + sampleTill = now + samplePeriodMs + else + -- Promote current to previous + prevAttempts = curAttempts + prevFailures = curFailures + sampleTill = sampleTill + samplePeriodMs + end + curAttempts = 0 + curFailures = 0 end -------------------------------------------------------------------------------- --- FAILURE — increment counter and evaluate whether to open the circuit --- --- The trip condition is evaluated against a sliding blend of current and --- previous window counts, not the raw current-window counts alone. This --- prevents a burst of failures from escaping detection simply because it --- straddles a window boundary and gets partially discarded by a reset. --------------------------------------------------------------------------------- - -cbFailures = cbFailures + 1 - --- The circuit is already open when openedUntil is set and has not yet elapsed. --- Guard against double-tripping, which would reset the cooldown timer early. -local circuitAlreadyOpen = openedUntil > 0 and now < openedUntil - --- Blend current and previous window counts. --- prevWeight decays linearly from 1.0 → 0.0 as the current window ages, --- so previous-window failures fade out gradually rather than dropping off a cliff. 
-local windowElapsed = cbWindowFrom > 0 and (now - cbWindowFrom) or 0 -local hasWindow = cbWindowPeriodMs > 0 -local prevWeight = hasWindow and math.max(0, (cbWindowPeriodMs - windowElapsed) / cbWindowPeriodMs) or 0 -local slidingFailures = cbFailures + cbPrevFailures * prevWeight -local slidingAttempts = cbAttempts + cbPrevAttempts * prevWeight - -if not circuitAlreadyOpen - and slidingAttempts >= cbMinAttempts -- enough data to be statistically meaningful - and (slidingFailures / slidingAttempts) > cbErrorThreshold then - -- Trip the circuit — reset all counters so recovery starts from a clean slate - redis.call("HSET", cbKey, - "opened_until_ms", now + cooldownMs, - "cb_window_from", 0, - "cb_failures", 0, - "cb_attempts", 0, - "cb_prev_failures", 0, - "cb_prev_attempts", 0 - ) - refreshCbExpiry() - return { 0, "opened" } +-- 3. RECORD NEW ATTEMPTS/FAILURES (unless fully open) +-------------------------------------------------------------------------------- + +if not isFullyOpen then + curAttempts = curAttempts + consumedTokens + curFailures = curFailures + processingFailures end --- Below the threshold — record the failure but keep the circuit closed -redis.call("HSET", cbKey, - "opened_until_ms", openedUntil, - "cb_window_from", cbWindowFrom, - "cb_failures", cbFailures, - "cb_attempts", cbAttempts, - "cb_prev_failures", cbPrevFailures, - "cb_prev_attempts", cbPrevAttempts +-------------------------------------------------------------------------------- +-- 4. INTERPOLATE VALUES +-------------------------------------------------------------------------------- + +local weight = (sampleTill - now) / samplePeriodMs +local attempts = prevAttempts * weight + curAttempts +local failures = prevFailures * weight + curFailures + +-------------------------------------------------------------------------------- +-- 5. 
CIRCUIT BREAKER LOGIC +-------------------------------------------------------------------------------- + +local processingSuccesses = consumedTokens - processingFailures +local stateChanged = false + +-- Close circuit when half-open and there are successes +if isHalfOpen and processingSuccesses > 0 then + isOpen = false + switchedAt = now + stateChanged = true + -- fall through, allow circuit to immediately re-open +end + +-- Open circuit when closed, enough samples, and threshold exceeded +local hasSampledEnough = attempts >= minAttempts +if not isOpen and hasSampledEnough and (failures / attempts) > failureThreshold then + isOpen = true + switchedAt = now + curAttempts = 0 + curFailures = 0 + prevAttempts = 0 + prevFailures = 0 + sampleTill = now + samplePeriodMs + stateChanged = true +end + +-------------------------------------------------------------------------------- +-- 6. DETERMINE CURRENT CIRCUIT STATE FOR REPORTING +-------------------------------------------------------------------------------- + +local circuitState +if isOpen then + if now > switchedAt + cooldownPeriodMs then + circuitState = HALF_OPEN + else + circuitState = OPEN + end +else + if now < switchedAt + recoveryPeriodMs then + circuitState = CLOSED_RECOVERY + else + circuitState = CLOSED + end +end + +-------------------------------------------------------------------------------- +-- 7. 
PERSIST STATE +-------------------------------------------------------------------------------- + +redis.call("HSET", epKey, + "cur_attempts", curAttempts, + "prev_attempts", prevAttempts, + "cur_failures", curFailures, + "prev_failures", prevFailures, + "sample_till", sampleTill ) -refreshCbExpiry() -return { 0, "failed" } + +if stateChanged then + redis.call("HSET", epKey, + "is_open", isOpen and 1 or 0, + "switched_at", switchedAt + ) +end + +return { circuitState, stateChanged and 1 or 0 } diff --git a/lambdas/perf-runner-lambda/package.json b/lambdas/perf-runner-lambda/package.json index 9f9d01d8..59d7691b 100644 --- a/lambdas/perf-runner-lambda/package.json +++ b/lambdas/perf-runner-lambda/package.json @@ -13,13 +13,17 @@ "typecheck": "tsc --noEmit" }, "dependencies": { + "@aws-crypto/sha256-js": "catalog:aws", "@aws-sdk/client-cloudwatch-logs": "catalog:aws", "@aws-sdk/client-sqs": "catalog:aws", + "@aws-sdk/credential-providers": "catalog:aws", + "@smithy/signature-v4": "catalog:aws", "@nhs-notify-client-callbacks/logger": "workspace:*", "@nhs-notify-client-callbacks/models": "workspace:*", - "esbuild": "catalog:tools" + "@redis/client": "catalog:app" }, "devDependencies": { + "esbuild": "catalog:tools", "@tsconfig/node22": "catalog:tools", "@types/aws-lambda": "catalog:tools", "@types/jest": "catalog:test", diff --git a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts index 055ac7bc..526de638 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/cloudwatch.test.ts @@ -1,5 +1,10 @@ import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; const mockCloudWatchClient = { send: 
jest.fn(), @@ -285,3 +290,373 @@ describe("queryDeliveryMetricsSnapshot", () => { expect(result).toBeNull(); }); }); + +describe("queryCircuitBreakerSnapshot", () => { + it("returns null when logGroupNames is empty", async () => { + const result = await queryCircuitBreakerSnapshot( + mockCloudWatchClient, + [], + 0, + 60, + ); + + expect(result).toBeNull(); + expect(mockCloudWatchClient.send).not.toHaveBeenCalled(); + }); + + it("returns null when StartQuery returns no queryId", async () => { + mockCloudWatchClient.send.mockResolvedValueOnce({} as never); + + const result = await queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 0, + 60, + ); + + expect(result).toBeNull(); + }); + + it("returns a snapshot with zeroed metrics when the result row is empty", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb1" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 100, + 160, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toMatchObject({ + intervalStartSec: 100, + intervalEndSec: 160, + circuitOpenEvents: 0, + circuitCloseEvents: 0, + admissionDeniedCircuitOpen: 0, + admissionDeniedRateLimited: 0, + deliveryAttempts: 0, + deliverySuccesses: 0, + deliveryFailures: 0, + deliveryRateLimited: 0, + }); + expect(result?.snapshotAt).toBeGreaterThan(0); + }); + + it("returns a populated snapshot when query completes successfully", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb2" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "circuitOpenEvents", value: "3" }, + { field: "circuitCloseEvents", value: "2" }, + { field: "admissionDeniedCircuitOpen", value: "15" }, + { field: "admissionDeniedRateLimited", value: "8" 
}, + { field: "deliveryAttempts", value: "200" }, + { field: "deliverySuccesses", value: "180" }, + { field: "deliveryFailures", value: "12" }, + { field: "deliveryRateLimited", value: "8" }, + ], + ], + } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 100, + 160, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toMatchObject({ + intervalStartSec: 100, + intervalEndSec: 160, + circuitOpenEvents: 3, + circuitCloseEvents: 2, + admissionDeniedCircuitOpen: 15, + admissionDeniedRateLimited: 8, + deliveryAttempts: 200, + deliverySuccesses: 180, + deliveryFailures: 12, + deliveryRateLimited: 8, + }); + }); + + it("sends logGroupNames to StartQuery", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb3" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const logGroups = [ + "/aws/lambda/test-https-client-perf-client-1", + "/aws/lambda/test-https-client-perf-client-2", + ]; + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + logGroups, + 0, + 60, + ); + + await jest.runAllTimersAsync(); + await promise; + + const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as { + input: { logGroupNames: string[] }; + }; + expect(startCmd.input.logGroupNames).toEqual(logGroups); + }); + + it("returns null when the query status is Failed", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-cb4" } as never) + .mockResolvedValueOnce({ status: "Failed" } as never); + + const promise = queryCircuitBreakerSnapshot( + mockCloudWatchClient, + ["/aws/lambda/test-https-client-perf-client-1"], + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toBeNull(); + }); +}); + +describe("queryPerClientRateTimeline", () => { + it("returns empty array when StartQuery returns no queryId", async () 
=> { + mockCloudWatchClient.send.mockResolvedValueOnce({} as never); + + const result = await queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + expect(result).toEqual([]); + }); + + it("returns empty array when the query status is Failed", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr1" } as never) + .mockResolvedValueOnce({ status: "Failed" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when results are empty", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when results is undefined", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2b" } as never) + .mockResolvedValueOnce({ status: "Complete" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("defaults missing fields to zero", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr2c" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "unknownField", value: "123" }]], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + 
"/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(1); + expect(result[0].deliveryAttempts).toBe(0); + expect(result[0].timestampSec).toBe( + Math.floor(new Date("0").getTime() / 1000), + ); + }); + + it("returns entries sorted by time bin when query completes", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr3" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "timeBin", value: "2026-04-09 10:00:00.000" }, + { field: "deliveryAttempts", value: "42" }, + ], + [ + { field: "timeBin", value: "2026-04-09 10:00:10.000" }, + { field: "deliveryAttempts", value: "38" }, + ], + ], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(2); + expect(result[0]).toEqual({ + timestampSec: Math.floor( + new Date("2026-04-09 10:00:00.000").getTime() / 1000, + ), + deliveryAttempts: 42, + }); + expect(result[1]).toEqual({ + timestampSec: Math.floor( + new Date("2026-04-09 10:00:10.000").getTime() / 1000, + ), + deliveryAttempts: 38, + }); + }); + + it("sends logGroupName to StartQuery", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr4" } as never) + .mockResolvedValueOnce({ status: "Complete", results: [] } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 100, + 200, + ); + + await jest.runAllTimersAsync(); + await promise; + + const startCmd = mockCloudWatchClient.send.mock.calls[0][0] as { + input: { logGroupName: string; startTime: number; endTime: number }; + }; + expect(startCmd.input.logGroupName).toBe( + "/aws/lambda/test-https-client-perf-client-1", + ); + 
expect(startCmd.input.startTime).toBe(100); + expect(startCmd.input.endTime).toBe(200); + }); + + it("polls until the query becomes Complete", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr5" } as never) + .mockResolvedValueOnce({ status: "Running" } as never) + .mockResolvedValueOnce({ + status: "Complete", + results: [ + [ + { field: "timeBin", value: "2026-04-09 10:00:00.000" }, + { field: "deliveryAttempts", value: "5" }, + ], + ], + } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toHaveLength(1); + expect(result[0].deliveryAttempts).toBe(5); + expect(mockCloudWatchClient.send).toHaveBeenCalledTimes(3); + }); + + it("returns empty array when the query does not complete within the timeout", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr6" } as never) + .mockResolvedValue({ status: "Running" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.advanceTimersByTimeAsync(60_000); + const result = await promise; + + expect(result).toEqual([]); + }); + + it("returns empty array when the query status is Cancelled", async () => { + mockCloudWatchClient.send + .mockResolvedValueOnce({ queryId: "qid-pcr7" } as never) + .mockResolvedValueOnce({ status: "Cancelled" } as never); + + const promise = queryPerClientRateTimeline( + mockCloudWatchClient, + "/aws/lambda/test-https-client-perf-client-1", + 0, + 60, + ); + + await jest.runAllTimersAsync(); + const result = await promise; + + expect(result).toEqual([]); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts new file mode 100644 index 00000000..09846ed3 
--- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/elasticache.test.ts @@ -0,0 +1,74 @@ +import { flushElastiCache } from "elasticache"; +import type { ElastiCacheDeps } from "types"; + +const mockConnect = jest.fn().mockResolvedValue(undefined); +const mockFlushAll = jest.fn().mockResolvedValue("OK"); +const mockDisconnect = jest.fn().mockResolvedValue(undefined); +let mockIsOpen = true; + +jest.mock("@redis/client", () => ({ + createClient: jest.fn(() => ({ + connect: mockConnect, + flushAll: mockFlushAll, + disconnect: mockDisconnect, + get isOpen() { + return mockIsOpen; + }, + })), +})); + +jest.mock("@smithy/signature-v4", () => ({ + SignatureV4: jest.fn(() => ({ + presign: jest.fn().mockResolvedValue({ + query: { + "X-Amz-Algorithm": "AWS4-HMAC-SHA256", + "X-Amz-Credential": "test-credential", + }, + }), + })), +})); + +jest.mock("@aws-crypto/sha256-js", () => ({ + Sha256: jest.fn(), +})); + +jest.mock("@aws-sdk/credential-providers", () => ({ + fromNodeProviderChain: jest.fn(() => ({})), +})); + +const deps: ElastiCacheDeps = { + endpoint: "test-cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", +}; + +beforeEach(() => { + jest.clearAllMocks(); + mockIsOpen = true; +}); + +describe("flushElastiCache", () => { + it("connects, flushes all keys, and disconnects", async () => { + await flushElastiCache(deps); + + expect(mockConnect).toHaveBeenCalledTimes(1); + expect(mockFlushAll).toHaveBeenCalledTimes(1); + expect(mockDisconnect).toHaveBeenCalledTimes(1); + }); + + it("disconnects even when flushAll throws", async () => { + mockFlushAll.mockRejectedValueOnce(new Error("flush failed")); + + await expect(flushElastiCache(deps)).rejects.toThrow("flush failed"); + expect(mockDisconnect).toHaveBeenCalledTimes(1); + }); + + it("skips disconnect when client is not open", async () => { + mockIsOpen = false; + + await flushElastiCache(deps); + + expect(mockDisconnect).not.toHaveBeenCalled(); + }); +}); diff 
--git a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts index 1d1a501a..b1edc297 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/index.test.ts +++ b/lambdas/perf-runner-lambda/src/__tests__/index.test.ts @@ -32,6 +32,7 @@ const mockResult: PerformanceResult = { phases: [], metrics: [], deliveryMetrics: [], + circuitBreakerMetrics: [], }; beforeEach(() => { @@ -41,7 +42,14 @@ beforeEach(() => { process.env.TRANSFORM_FILTER_LOG_GROUP = "/aws/lambda/nhs-dev-callbacks-client-transform-filter"; process.env.DELIVERY_LOG_GROUP_PREFIX = - "/aws/lambda/nhs-dev-callbacks-https-client-"; + "/aws/lambda/nhs-dev-cbc-https-client-"; + process.env.DELIVERY_QUEUE_URL_PREFIX = + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-"; + process.env.MOCK_WEBHOOK_LOG_GROUP = + "/aws/lambda/nhs-dev-callbacks-mock-webhook"; + process.env.ELASTICACHE_ENDPOINT = "cache.example.invalid"; + process.env.ELASTICACHE_CACHE_NAME = "test-cache"; + process.env.ELASTICACHE_IAM_USERNAME = "test-user"; process.env.AWS_REGION = "eu-west-2"; }); @@ -54,10 +62,20 @@ describe("handler", () => { expect.objectContaining({ queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", - deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-https-client-", + deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-cbc-https-client-", + deliveryQueueUrlPrefix: + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-", + mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", }), DEFAULT_SCENARIO, "test-id", + undefined, + expect.objectContaining({ + endpoint: "cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", + }), ); }); @@ -73,6 +91,8 @@ describe("handler", () => { expect.anything(), customScenario, "custom-test", + undefined, + expect.anything(), ); }); @@ -117,6 +137,38 @@ describe("handler", () => { }), 
DEFAULT_SCENARIO, "no-prefix-test", + undefined, + expect.anything(), + ); + }); + + it("passes undefined elastiCacheDeps when ElastiCache env vars are missing", async () => { + delete process.env.ELASTICACHE_ENDPOINT; + delete process.env.ELASTICACHE_CACHE_NAME; + delete process.env.ELASTICACHE_IAM_USERNAME; + + await handler({ testId: "no-cache-test" }); + + expect(mockRunPerformanceTest).toHaveBeenCalledWith( + expect.anything(), + DEFAULT_SCENARIO, + "no-cache-test", + undefined, + undefined, + ); + }); + + it("passes mockWebhookLogGroup from env var", async () => { + await handler({ testId: "webhook-test" }); + + expect(mockRunPerformanceTest).toHaveBeenCalledWith( + expect.objectContaining({ + mockWebhookLogGroup: "/aws/lambda/nhs-dev-callbacks-mock-webhook", + }), + expect.anything(), + "webhook-test", + undefined, + expect.anything(), ); }); }); diff --git a/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts new file mode 100644 index 00000000..016779ea --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/purge.test.ts @@ -0,0 +1,140 @@ +import type { SQSClient } from "@aws-sdk/client-sqs"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import type { Scenario } from "types"; + +const scenario: Scenario = { + phases: [{ durationSecs: 1, targetEps: 10 }], + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + metricsIntervalSecs: 5, +}; + +const inboundQueueUrl = + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue"; + +const deliveryQueueUrlPrefix = + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-"; + +describe("deriveQueueUrls", () => { + it("derives all queue URLs from the inbound queue URL, scenario and delivery prefix", () => { + const urls = 
deriveQueueUrls( + inboundQueueUrl, + scenario, + deliveryQueueUrlPrefix, + ); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-2-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-2-delivery-dlq-queue", + ]); + }); + + it("deduplicates client IDs that appear multiple times in eventMix", () => { + const duplicateScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-1", + channelStatus: "DELIVERED", + }, + ], + }; + + const urls = deriveQueueUrls( + inboundQueueUrl, + duplicateScenario, + deliveryQueueUrlPrefix, + ); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-perf-client-1-delivery-dlq-queue", + ]); + }); + + it("falls back to inbound URL prefix when no delivery prefix is given", () => { + const urls = deriveQueueUrls(inboundQueueUrl, scenario); + + expect(urls).toEqual([ + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-inbound-event-dlq-queue", + 
"https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-1-delivery-dlq-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-queue", + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-callbacks-perf-client-2-delivery-dlq-queue", + ]); + }); +}); + +describe("purgeQueues", () => { + const mockSend = jest.fn().mockResolvedValue({}); + const mockSqsClient = { send: mockSend } as unknown as SQSClient; + + beforeEach(() => { + jest.clearAllMocks(); + mockSend.mockResolvedValue({}); + }); + + it("sends a PurgeQueueCommand for each queue URL", async () => { + const urls = [ + "https://sqs.example.invalid/queue-a", + "https://sqs.example.invalid/queue-b", + ]; + + await purgeQueues(mockSqsClient, urls); + + expect(mockSend).toHaveBeenCalledTimes(2); + }); + + it("ignores NonExistentQueue errors gracefully", async () => { + const nonExistentError = Object.assign(new Error("Queue does not exist"), { + name: "AWS.SimpleQueueService.NonExistentQueue", + }); + mockSend.mockRejectedValueOnce(nonExistentError); + + await expect( + purgeQueues(mockSqsClient, ["https://sqs.example.invalid/missing"]), + ).resolves.toBeUndefined(); + }); + + it("rethrows non-NonExistentQueue errors", async () => { + const otherError = new Error("Access denied"); + mockSend.mockRejectedValueOnce(otherError); + + await expect( + purgeQueues(mockSqsClient, ["https://sqs.example.invalid/queue"]), + ).rejects.toThrow("Access denied"); + }); + + it("handles an empty queue URL list without sending any commands", async () => { + await purgeQueues(mockSqsClient, []); + + expect(mockSend).not.toHaveBeenCalled(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts index 1cf5f3a3..23c720de 100644 --- a/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts +++ 
b/lambdas/perf-runner-lambda/src/__tests__/runner.test.ts @@ -1,6 +1,7 @@ import type { SQSClient } from "@aws-sdk/client-sqs"; import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; import type { + CircuitBreakerSnapshot, DeliveryMetricsSnapshot, MetricsSnapshot, PhaseResult, @@ -10,16 +11,35 @@ import type { import { defaultSleep, runPerformanceTest } from "runner"; import { generatePhaseLoad } from "sqs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import { flushElastiCache } from "elasticache"; +import { verifyMockWebhook } from "webhook-verify"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; jest.mock("sqs"); jest.mock("cloudwatch"); +jest.mock("purge"); +jest.mock("elasticache"); +jest.mock("webhook-verify"); const mockGeneratePhaseLoad = jest.mocked(generatePhaseLoad); const mockQueryMetricsSnapshot = jest.mocked(queryMetricsSnapshot); const mockQueryDeliveryMetricsSnapshot = jest.mocked( queryDeliveryMetricsSnapshot, ); +const mockQueryCircuitBreakerSnapshot = jest.mocked( + queryCircuitBreakerSnapshot, +); +const mockQueryPerClientRateTimeline = jest.mocked(queryPerClientRateTimeline); +const mockDeriveQueueUrls = jest.mocked(deriveQueueUrls); +const mockPurgeQueues = jest.mocked(purgeQueues); +const mockFlushElastiCache = jest.mocked(flushElastiCache); +const mockVerifyMockWebhook = jest.mocked(verifyMockWebhook); const immediateSleep = jest.fn().mockResolvedValue(undefined); @@ -46,6 +66,20 @@ const mockDeliverySnapshot: DeliveryMetricsSnapshot = { p99Ms: 500, }; +const mockCbSnapshot: CircuitBreakerSnapshot = { + snapshotAt: Date.now(), + intervalStartSec: 0, + intervalEndSec: 60, + circuitOpenEvents: 1, + circuitCloseEvents: 0, + admissionDeniedCircuitOpen: 5, + admissionDeniedRateLimited: 3, + deliveryAttempts: 100, + deliverySuccesses: 
92, + deliveryFailures: 5, + deliveryRateLimited: 3, +}; + const scenario: Scenario = { phases: [{ durationSecs: 1, targetEps: 1000 }], eventMix: [ @@ -64,13 +98,26 @@ const deps: RunnerDeps = { cloudWatchClient: {} as CloudWatchLogsClient, queueUrl: "https://sqs.example.invalid/queue", logGroupName: "/aws/lambda/nhs-dev-callbacks-client-transform-filter", - deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-callbacks-https-client-", + deliveryLogGroupPrefix: "/aws/lambda/nhs-dev-cbc-https-client-", + deliveryQueueUrlPrefix: + "https://sqs.eu-west-2.amazonaws.com/123456789/nhs-dev-cbc-", }; beforeEach(() => { jest.clearAllMocks(); mockGeneratePhaseLoad.mockResolvedValue(mockPhaseResult); mockQueryDeliveryMetricsSnapshot.mockResolvedValue(null); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline.mockResolvedValue([]); + mockDeriveQueueUrls.mockReturnValue([ + "https://sqs.example.invalid/inbound-event-queue", + ]); + mockPurgeQueues.mockResolvedValue(undefined); + mockFlushElastiCache.mockResolvedValue(undefined); + mockVerifyMockWebhook.mockResolvedValue({ + receivedCallbacks: 0, + verified: false, + }); immediateSleep.mockResolvedValue(undefined); }); @@ -78,6 +125,7 @@ describe("runPerformanceTest", () => { it("returns a PerformanceResult with phase results and snapshots from polling and final query", async () => { mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); const result = await runPerformanceTest( deps, @@ -92,6 +140,7 @@ describe("runPerformanceTest", () => { expect(result.phases[0]).toEqual(mockPhaseResult); expect(result.metrics).toHaveLength(2); // one mid-test, one final expect(result.deliveryMetrics).toHaveLength(2); // one mid-test, one final + expect(result.circuitBreakerMetrics).toHaveLength(2); // one mid-test, one final expect(result.startedAt).toBeTruthy(); 
expect(result.completedAt).toBeTruthy(); }); @@ -111,6 +160,7 @@ describe("runPerformanceTest", () => { expect(result.metrics).toHaveLength(1); expect(result.metrics[0]).toEqual(mockSnapshot); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("produces an empty metrics array when all queries return null", async () => { @@ -125,6 +175,7 @@ describe("runPerformanceTest", () => { expect(result.metrics).toHaveLength(0); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("runs all phases and collects each result", async () => { @@ -267,7 +318,9 @@ describe("runPerformanceTest", () => { ); expect(mockQueryDeliveryMetricsSnapshot).not.toHaveBeenCalled(); + expect(mockQueryCircuitBreakerSnapshot).not.toHaveBeenCalled(); expect(result.deliveryMetrics).toHaveLength(0); + expect(result.circuitBreakerMetrics).toHaveLength(0); }); it("builds delivery log group names from prefix and event mix client IDs", async () => { @@ -302,13 +355,284 @@ describe("runPerformanceTest", () => { expect(mockQueryDeliveryMetricsSnapshot).toHaveBeenCalledWith( deps.cloudWatchClient, expect.arrayContaining([ - "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-1", - "/aws/lambda/nhs-dev-callbacks-https-client-perf-client-2", + "/aws/lambda/nhs-dev-cbc-https-client-perf-client-1", + "/aws/lambda/nhs-dev-cbc-https-client-perf-client-2", ]), expect.any(Number), expect.any(Number), ); }); + + it("collects circuit breaker metrics when deliveryLogGroupPrefix is set", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); + + const result = await runPerformanceTest( + deps, + scenario, + "test-cb-1", + immediateSleep, + ); + + expect(result.circuitBreakerMetrics.length).toBeGreaterThanOrEqual(1); + 
expect(mockQueryCircuitBreakerSnapshot).toHaveBeenCalled(); + }); + + it("returns empty circuitBreakerMetrics when CB queries return null", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(null); + + const result = await runPerformanceTest( + deps, + scenario, + "test-cb-null", + immediateSleep, + ); + + expect(result.circuitBreakerMetrics).toHaveLength(0); + }); + + it("uses per-interval windowing for circuit breaker snapshots", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryCircuitBreakerSnapshot.mockResolvedValue(mockCbSnapshot); + + let resolvePhase!: (value: PhaseResult) => void; + mockGeneratePhaseLoad.mockImplementation( + () => + new Promise((r) => { + resolvePhase = r; + }), + ); + + let sleepCount = 0; + const controlledSleep = jest.fn(async () => { + sleepCount += 1; + if (sleepCount >= 3) { + resolvePhase(mockPhaseResult); + } + }); + + await runPerformanceTest( + deps, + scenario, + "test-cb-interval", + controlledSleep, + ); + + const cbCalls = mockQueryCircuitBreakerSnapshot.mock.calls; + expect(cbCalls.length).toBeGreaterThanOrEqual(2); + const firstCallEndSec = cbCalls[0][3]; + const secondCallStartSec = cbCalls[1][2]; + expect(secondCallStartSec).toBe(firstCallEndSec); + }); + + it("collects per-client rate timelines when deliveryLogGroupPrefix is set", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + mockQueryDeliveryMetricsSnapshot.mockResolvedValue(mockDeliverySnapshot); + mockQueryPerClientRateTimeline.mockResolvedValue([ + { timestampSec: 1000, deliveryAttempts: 10 }, + ]); + + const result = await runPerformanceTest( + deps, + scenario, + "test-pcr-1", + immediateSleep, + ); + + expect(result.perClientRateTimelines).toHaveLength(1); + 
expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1"); + expect(result.perClientRateTimelines![0].entries).toHaveLength(1); + }); + + it("queries each client log group individually for rate timelines", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline.mockResolvedValue([ + { timestampSec: 1000, deliveryAttempts: 5 }, + ]); + + const multiClientScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + }; + + const result = await runPerformanceTest( + deps, + multiClientScenario, + "test-pcr-multi", + immediateSleep, + ); + + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledTimes(2); + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( + deps.cloudWatchClient, + "/aws/lambda/nhs-dev-cbc-https-client-perf-client-1", + expect.any(Number), + expect.any(Number), + ); + expect(mockQueryPerClientRateTimeline).toHaveBeenCalledWith( + deps.cloudWatchClient, + "/aws/lambda/nhs-dev-cbc-https-client-perf-client-2", + expect.any(Number), + expect.any(Number), + ); + expect(result.perClientRateTimelines).toHaveLength(2); + }); + + it("excludes clients with empty rate timelines", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockQueryPerClientRateTimeline + .mockResolvedValueOnce([{ timestampSec: 1000, deliveryAttempts: 5 }]) + .mockResolvedValueOnce([]); + + const multiClientScenario: Scenario = { + ...scenario, + eventMix: [ + { + weight: 1, + factory: "messageStatus", + clientId: "perf-client-1", + messageStatus: "DELIVERED", + }, + { + weight: 1, + factory: "channelStatus", + clientId: "perf-client-2", + channelStatus: "DELIVERED", + }, + ], + }; + + const result = await runPerformanceTest( + deps, + multiClientScenario, + "test-pcr-filter", + 
immediateSleep, + ); + + expect(result.perClientRateTimelines).toHaveLength(1); + expect(result.perClientRateTimelines![0].clientId).toBe("perf-client-1"); + }); + + it("skips per-client rate timelines when deliveryLogGroupPrefix is undefined", async () => { + const depsWithoutPrefix: RunnerDeps = { + ...deps, + deliveryLogGroupPrefix: undefined, + }; + mockQueryMetricsSnapshot.mockResolvedValue(mockSnapshot); + + const result = await runPerformanceTest( + depsWithoutPrefix, + scenario, + "test-pcr-skip", + immediateSleep, + ); + + expect(mockQueryPerClientRateTimeline).not.toHaveBeenCalled(); + expect(result.perClientRateTimelines).toHaveLength(0); + }); + + it("purges queues before and after the test run", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + await runPerformanceTest(deps, scenario, "test-purge", immediateSleep); + + expect(mockDeriveQueueUrls).toHaveBeenCalledWith( + deps.queueUrl, + scenario, + deps.deliveryQueueUrlPrefix, + ); + expect(mockPurgeQueues).toHaveBeenCalledTimes(2); + }); + + it("flushes ElastiCache before and after when deps are provided", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + const elastiCacheDeps = { + endpoint: "cache.example.invalid", + cacheName: "test-cache", + iamUsername: "test-user", + region: "eu-west-2", + }; + + await runPerformanceTest( + deps, + scenario, + "test-flush", + immediateSleep, + elastiCacheDeps, + ); + + expect(mockFlushElastiCache).toHaveBeenCalledTimes(2); + expect(mockFlushElastiCache).toHaveBeenCalledWith(elastiCacheDeps); + }); + + it("skips ElastiCache flush when deps are not provided", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + await runPerformanceTest(deps, scenario, "test-no-flush", immediateSleep); + + expect(mockFlushElastiCache).not.toHaveBeenCalled(); + }); + + it("verifies mock webhook when log group is configured", async () => { + const depsWithWebhook: RunnerDeps = { + ...deps, + mockWebhookLogGroup: 
"/aws/lambda/test-mock-webhook", + }; + mockQueryMetricsSnapshot.mockResolvedValue(null); + mockVerifyMockWebhook.mockResolvedValue({ + receivedCallbacks: 25, + verified: true, + }); + + const result = await runPerformanceTest( + depsWithWebhook, + scenario, + "test-webhook", + immediateSleep, + ); + + expect(mockVerifyMockWebhook).toHaveBeenCalledWith( + depsWithWebhook.cloudWatchClient, + "/aws/lambda/test-mock-webhook", + expect.any(Number), + expect.any(Number), + ); + expect(result.webhookVerification).toEqual({ + receivedCallbacks: 25, + verified: true, + }); + }); + + it("omits webhook verification when log group is not configured", async () => { + mockQueryMetricsSnapshot.mockResolvedValue(null); + + const result = await runPerformanceTest( + deps, + scenario, + "test-no-webhook", + immediateSleep, + ); + + expect(mockVerifyMockWebhook).not.toHaveBeenCalled(); + expect(result.webhookVerification).toBeUndefined(); + }); }); describe("defaultSleep", () => { diff --git a/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts new file mode 100644 index 00000000..72c49870 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/__tests__/webhook-verify.test.ts @@ -0,0 +1,173 @@ +import type { CloudWatchLogsClient } from "@aws-sdk/client-cloudwatch-logs"; +import { verifyMockWebhook } from "webhook-verify"; + +const mockSend = jest.fn(); +const mockClient = { send: mockSend } as unknown as CloudWatchLogsClient; + +beforeEach(() => { + jest.clearAllMocks(); +}); + +describe("verifyMockWebhook", () => { + it("returns verified=true when callbacks are found", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-1" }).mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "42" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 42, 
verified: true }); + }); + + it("returns verified=false when no callbacks are found", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-2" }).mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "0" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when query fails", async () => { + mockSend + .mockResolvedValueOnce({ queryId: "q-3" }) + .mockResolvedValueOnce({ status: "Failed" }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when no queryId is returned", async () => { + mockSend.mockResolvedValueOnce({}); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when results are empty", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-4" }).mockResolvedValueOnce({ + status: "Complete", + results: [], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when results field is undefined", async () => { + mockSend.mockResolvedValueOnce({ queryId: "q-4b" }).mockResolvedValueOnce({ + status: "Complete", + results: undefined, + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("polls until the query completes", async () => { + mockSend + .mockResolvedValueOnce({ queryId: 
"q-5" }) + .mockResolvedValueOnce({ status: "Running" }) + .mockResolvedValueOnce({ + status: "Complete", + results: [[{ field: "callbackCount", value: "10" }]], + }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 10, verified: true }); + expect(mockSend).toHaveBeenCalledTimes(3); + }); + + it("returns verified=false when query is cancelled", async () => { + mockSend + .mockResolvedValueOnce({ queryId: "q-6" }) + .mockResolvedValueOnce({ status: "Cancelled" }); + + const result = await verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + }); + + it("returns verified=false when polling times out", async () => { + jest.useFakeTimers(); + + mockSend.mockResolvedValueOnce({ queryId: "q-7" }).mockImplementation( + () => + new Promise((resolve) => { + setTimeout(() => resolve({ status: "Running" }), 1000); + }), + ); + + const originalDateNow = Date.now; + let callCount = 0; + jest.spyOn(Date, "now").mockImplementation(() => { + callCount += 1; + if (callCount <= 1) return originalDateNow.call(Date); + return originalDateNow.call(Date) + 60_000; + }); + + const promise = verifyMockWebhook( + mockClient, + "/aws/lambda/test-mock-webhook", + 1000, + 2000, + ); + + await jest.advanceTimersByTimeAsync(60_000); + + const result = await promise; + + expect(result).toEqual({ receivedCallbacks: 0, verified: false }); + + jest.useRealTimers(); + jest.restoreAllMocks(); + }); +}); diff --git a/lambdas/perf-runner-lambda/src/cloudwatch.ts b/lambdas/perf-runner-lambda/src/cloudwatch.ts index 206bec33..598f5f3f 100644 --- a/lambdas/perf-runner-lambda/src/cloudwatch.ts +++ b/lambdas/perf-runner-lambda/src/cloudwatch.ts @@ -3,19 +3,22 @@ import { GetQueryResultsCommand, StartQueryCommand, } from "@aws-sdk/client-cloudwatch-logs"; -import type { 
DeliveryMetricsSnapshot, MetricsSnapshot } from "types"; +import type { + CircuitBreakerSnapshot, + DeliveryMetricsSnapshot, + MetricsSnapshot, + PerClientRateEntry, +} from "types"; const INSIGHTS_POLL_INTERVAL_MS = 2000; const INSIGHTS_TIMEOUT_MS = 30_000; type ResultField = { field?: string; value?: string }; -async function pollQueryResults( +async function pollInsightsQuery( client: CloudWatchLogsClient, queryId: string, - mapRow: (row: ResultField[]) => T, -): Promise { - const zeroResult = mapRow([]); +): Promise { const deadline = Date.now() + INSIGHTS_TIMEOUT_MS; while (Date.now() < deadline) { @@ -30,15 +33,33 @@ async function pollQueryResults( } if (response.status === "Complete") { - const row = response.results?.[0]; - if (!row) return zeroResult; - return mapRow(row); + return (response.results as ResultField[][]) ?? []; } } return null; } +async function pollQueryResults( + client: CloudWatchLogsClient, + queryId: string, + mapRow: (row: ResultField[]) => T, +): Promise { + const rows = await pollInsightsQuery(client, queryId); + if (rows === null) return null; + return mapRow(rows[0] ?? 
[]); +} + +async function pollAllQueryResults( + client: CloudWatchLogsClient, + queryId: string, + mapRow: (row: ResultField[]) => T, +): Promise { + const rows = await pollInsightsQuery(client, queryId); + if (rows === null) return []; + return rows.map((row) => mapRow(row)); +} + export async function queryMetricsSnapshot( client: CloudWatchLogsClient, logGroupName: string, @@ -108,3 +129,86 @@ export async function queryDeliveryMetricsSnapshot( }; }); } + +export async function queryCircuitBreakerSnapshot( + client: CloudWatchLogsClient, + logGroupNames: string[], + startTimeSec: number, + endTimeSec: number, +): Promise { + if (logGroupNames.length === 0) return null; + + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupNames, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg in ["Circuit breaker opened", "Circuit breaker closed", "Admission denied", "Attempting delivery", "Delivery succeeded", "Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ", "Rate limited (429)"]', + '| stats sum(msg = "Circuit breaker opened") as circuitOpenEvents,', + ' sum(msg = "Circuit breaker closed") as circuitCloseEvents,', + ' sum(msg = "Admission denied" and reason = "circuit_open") as admissionDeniedCircuitOpen,', + ' sum(msg = "Admission denied" and reason = "rate_limited") as admissionDeniedRateLimited,', + ' sum(msg = "Attempting delivery") as deliveryAttempts,', + ' sum(msg = "Delivery succeeded") as deliverySuccesses,', + ' sum(msg in ["Transient delivery failure \u2014 requeuing", "Permanent delivery failure \u2014 sending to DLQ"]) as deliveryFailures,', + ' sum(msg = "Rate limited (429)") as deliveryRateLimited', + ].join("\n"), + }), + ); + + if (!queryId) return null; + + return pollQueryResults(client, queryId, (row) => { + const getField = (name: string): number => + Number(row.find((f) => f.field === name)?.value ?? 
0); + + return { + snapshotAt: Date.now(), + intervalStartSec: startTimeSec, + intervalEndSec: endTimeSec, + circuitOpenEvents: getField("circuitOpenEvents"), + circuitCloseEvents: getField("circuitCloseEvents"), + admissionDeniedCircuitOpen: getField("admissionDeniedCircuitOpen"), + admissionDeniedRateLimited: getField("admissionDeniedRateLimited"), + deliveryAttempts: getField("deliveryAttempts"), + deliverySuccesses: getField("deliverySuccesses"), + deliveryFailures: getField("deliveryFailures"), + deliveryRateLimited: getField("deliveryRateLimited"), + }; + }); +} + +const RATE_TIMELINE_BIN_SECONDS = 10; + +export async function queryPerClientRateTimeline( + client: CloudWatchLogsClient, + logGroupName: string, + startTimeSec: number, + endTimeSec: number, +): Promise { + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupName, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg in ["Attempting delivery", "Admission denied"]', + `| stats sum(msg = "Attempting delivery") as deliveryAttempts by bin(@timestamp, ${RATE_TIMELINE_BIN_SECONDS}s) as timeBin`, + "| sort timeBin asc", + ].join("\n"), + }), + ); + + if (!queryId) return []; + + return pollAllQueryResults(client, queryId, (row) => { + const timeBinStr = row.find((f) => f.field === "timeBin")?.value ?? "0"; + const timestampSec = Math.floor(new Date(timeBinStr).getTime() / 1000); + const deliveryAttempts = Number( + row.find((f) => f.field === "deliveryAttempts")?.value ?? 
0, + ); + + return { timestampSec, deliveryAttempts }; + }); +} diff --git a/lambdas/perf-runner-lambda/src/elasticache.ts b/lambdas/perf-runner-lambda/src/elasticache.ts new file mode 100644 index 00000000..8d0b86c6 --- /dev/null +++ b/lambdas/perf-runner-lambda/src/elasticache.ts @@ -0,0 +1,52 @@ +import { type RedisClientType, createClient } from "@redis/client"; +import { SignatureV4 } from "@smithy/signature-v4"; +import { Sha256 } from "@aws-crypto/sha256-js"; +import { fromNodeProviderChain } from "@aws-sdk/credential-providers"; +import type { ElastiCacheDeps } from "types"; + +const TOKEN_EXPIRY_SECONDS = 900; + +async function generateIamToken(deps: ElastiCacheDeps): Promise { + const signer = new SignatureV4({ + credentials: fromNodeProviderChain(), + region: deps.region, + service: "elasticache", + sha256: Sha256, + }); + + const signed = await signer.presign( + { + protocol: "https:", + method: "GET", + hostname: deps.cacheName, + path: "/", + query: { Action: "connect", User: deps.iamUsername }, + headers: { host: deps.cacheName }, + }, + { expiresIn: TOKEN_EXPIRY_SECONDS }, + ); + + const qs = new URLSearchParams( + signed.query as Record, + ).toString(); + return `${deps.cacheName}/?${qs}`; +} + +export async function flushElastiCache(deps: ElastiCacheDeps): Promise { + const token = await generateIamToken(deps); + + const client: RedisClientType = createClient({ + url: `rediss://${deps.endpoint}:6379`, + username: deps.iamUsername, + password: token, + }); + + try { + await client.connect(); + await client.flushAll(); + } finally { + if (client.isOpen) { + await client.disconnect(); + } + } +} diff --git a/lambdas/perf-runner-lambda/src/index.ts b/lambdas/perf-runner-lambda/src/index.ts index a0881866..f201f1ea 100644 --- a/lambdas/perf-runner-lambda/src/index.ts +++ b/lambdas/perf-runner-lambda/src/index.ts @@ -3,7 +3,11 @@ import { SQSClient } from "@aws-sdk/client-sqs"; import { Logger } from "@nhs-notify-client-callbacks/logger"; import { 
runPerformanceTest } from "runner"; import { DEFAULT_SCENARIO } from "scenario"; -import type { PerfRunnerPayload, PerformanceResult } from "types"; +import type { + ElastiCacheDeps, + PerfRunnerPayload, + PerformanceResult, +} from "types"; const logger = new Logger(); @@ -16,6 +20,11 @@ export async function handler( const queueUrl = process.env.INBOUND_QUEUE_URL; const logGroupName = process.env.TRANSFORM_FILTER_LOG_GROUP; const deliveryLogGroupPrefix = process.env.DELIVERY_LOG_GROUP_PREFIX; + const deliveryQueueUrlPrefix = process.env.DELIVERY_QUEUE_URL_PREFIX; + const mockWebhookLogGroup = process.env.MOCK_WEBHOOK_LOG_GROUP; + const elasticacheEndpoint = process.env.ELASTICACHE_ENDPOINT; + const elasticacheCacheName = process.env.ELASTICACHE_CACHE_NAME; + const elasticacheIamUsername = process.env.ELASTICACHE_IAM_USERNAME; if (!queueUrl) { throw new Error("Missing required environment variable: INBOUND_QUEUE_URL"); @@ -30,6 +39,16 @@ export async function handler( const sqsClient = new SQSClient({ region }); const cloudWatchClient = new CloudWatchLogsClient({ region }); + const elastiCacheDeps: ElastiCacheDeps | undefined = + elasticacheEndpoint && elasticacheCacheName && elasticacheIamUsername + ? 
{ + endpoint: elasticacheEndpoint, + cacheName: elasticacheCacheName, + iamUsername: elasticacheIamUsername, + region, + } + : undefined; + logger.info("Performance test started", { testId }); try { @@ -40,9 +59,13 @@ export async function handler( queueUrl, logGroupName, deliveryLogGroupPrefix, + deliveryQueueUrlPrefix, + mockWebhookLogGroup, }, scenario, testId, + undefined, + elastiCacheDeps, ); logger.info("Performance test completed", { testId }); diff --git a/lambdas/perf-runner-lambda/src/purge.ts b/lambdas/perf-runner-lambda/src/purge.ts new file mode 100644 index 00000000..bd51097b --- /dev/null +++ b/lambdas/perf-runner-lambda/src/purge.ts @@ -0,0 +1,42 @@ +import { PurgeQueueCommand, type SQSClient } from "@aws-sdk/client-sqs"; +import type { Scenario } from "types"; + +export function deriveQueueUrls( + inboundQueueUrl: string, + scenario: Scenario, + deliveryQueueUrlPrefix?: string, +): string[] { + // eslint-disable-next-line sonarjs/null-dereference -- String.replace always returns a string + const inboundBaseUrl = inboundQueueUrl.replace(/inbound-event-queue$/, ""); + const deliveryBaseUrl = deliveryQueueUrlPrefix ?? 
inboundBaseUrl; + const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))]; + + return [ + inboundQueueUrl, + `${inboundBaseUrl}inbound-event-dlq-queue`, + ...clientIds.flatMap((id) => [ + `${deliveryBaseUrl}${id}-delivery-queue`, + `${deliveryBaseUrl}${id}-delivery-dlq-queue`, + ]), + ]; +} + +export async function purgeQueues( + client: SQSClient, + queueUrls: string[], +): Promise { + const results = await Promise.allSettled( + queueUrls.map((url) => + client.send(new PurgeQueueCommand({ QueueUrl: url })), + ), + ); + + for (const result of results) { + if (result.status === "rejected") { + const error = result.reason as { name?: string }; + if (error.name !== "AWS.SimpleQueueService.NonExistentQueue") { + throw result.reason as Error; + } + } + } +} diff --git a/lambdas/perf-runner-lambda/src/runner.ts b/lambdas/perf-runner-lambda/src/runner.ts index a265e90e..0ae6a81c 100644 --- a/lambdas/perf-runner-lambda/src/runner.ts +++ b/lambdas/perf-runner-lambda/src/runner.ts @@ -1,13 +1,25 @@ import type { + CircuitBreakerSnapshot, DeliveryMetricsSnapshot, + ElastiCacheDeps, MetricsSnapshot, + PerClientRateTimeline, PerformanceResult, PhaseResult, RunnerDeps, Scenario, + WebhookVerificationResult, } from "types"; import { generatePhaseLoad } from "sqs"; -import { queryDeliveryMetricsSnapshot, queryMetricsSnapshot } from "cloudwatch"; +import { deriveQueueUrls, purgeQueues } from "purge"; +import { flushElastiCache } from "elasticache"; +import { verifyMockWebhook } from "webhook-verify"; +import { + queryCircuitBreakerSnapshot, + queryDeliveryMetricsSnapshot, + queryMetricsSnapshot, + queryPerClientRateTimeline, +} from "cloudwatch"; const CLOUDWATCH_SETTLING_MS = 60_000; @@ -25,11 +37,56 @@ function buildDeliveryLogGroupNames( return [...clientIds].map((id) => `${prefix}${id}`); } +async function collectSnapshots( + deps: RunnerDeps, + deliveryLogGroupNames: string[], + startSec: number, + endSec: number, + cbStartSec: number, + out: { + snapshots: 
MetricsSnapshot[]; + deliverySnapshots: DeliveryMetricsSnapshot[]; + cbSnapshots: CircuitBreakerSnapshot[]; + }, +): Promise { + const snap = await queryMetricsSnapshot( + deps.cloudWatchClient, + deps.logGroupName, + startSec, + endSec, + ); + if (snap !== null) out.snapshots.push(snap); + + if (deliveryLogGroupNames.length > 0) { + const deliverySnap = await queryDeliveryMetricsSnapshot( + deps.cloudWatchClient, + deliveryLogGroupNames, + startSec, + endSec, + ); + if (deliverySnap !== null) out.deliverySnapshots.push(deliverySnap); + + const cbSnap = await queryCircuitBreakerSnapshot( + deps.cloudWatchClient, + deliveryLogGroupNames, + cbStartSec, + endSec, + ); + if (cbSnap !== null) { + out.cbSnapshots.push(cbSnap); + return endSec; + } + } + + return cbStartSec; +} + export async function runPerformanceTest( deps: RunnerDeps, scenario: Scenario, testId: string, sleepFn: (ms: number) => Promise = defaultSleep, + elastiCacheDeps?: ElastiCacheDeps, ): Promise { if (scenario.eventMix.length === 0) { throw new Error("scenario.eventMix must contain at least one entry"); @@ -49,10 +106,23 @@ export async function runPerformanceTest( } const testStartMs = Date.now(); + + const queueUrls = deriveQueueUrls( + deps.queueUrl, + scenario, + deps.deliveryQueueUrlPrefix, + ); + await purgeQueues(deps.sqsClient, queueUrls); + if (elastiCacheDeps) { + await flushElastiCache(elastiCacheDeps); + } + const startedAt = new Date(testStartMs).toISOString(); const phaseResults: PhaseResult[] = []; const snapshots: MetricsSnapshot[] = []; const deliverySnapshots: DeliveryMetricsSnapshot[] = []; + const cbSnapshots: CircuitBreakerSnapshot[] = []; + let lastCbSnapshotSec = Math.floor(testStartMs / 1000); let stopPolling = false; const deliveryLogGroupNames = buildDeliveryLogGroupNames( @@ -60,29 +130,22 @@ export async function runPerformanceTest( scenario, ); + const out = { snapshots, deliverySnapshots, cbSnapshots }; + const pollLoop = async (): Promise => { await 
sleepFn(scenario.metricsIntervalSecs * 1000); while (!stopPolling) { const startSec = Math.floor(testStartMs / 1000); const endSec = Math.floor(Date.now() / 1000); - const snap = await queryMetricsSnapshot( - deps.cloudWatchClient, - deps.logGroupName, + lastCbSnapshotSec = await collectSnapshots( + deps, + deliveryLogGroupNames, startSec, endSec, + lastCbSnapshotSec, + out, ); - if (snap !== null) snapshots.push(snap); - - if (deliveryLogGroupNames.length > 0) { - const deliverySnap = await queryDeliveryMetricsSnapshot( - deps.cloudWatchClient, - deliveryLogGroupNames, - startSec, - endSec, - ); - if (deliverySnap !== null) deliverySnapshots.push(deliverySnap); - } if (!stopPolling) { await sleepFn(scenario.metricsIntervalSecs * 1000); @@ -110,22 +173,48 @@ export async function runPerformanceTest( const finalStartSec = Math.floor(testStartMs / 1000); const finalEndSec = Math.floor(Date.now() / 1000); - const finalSnap = await queryMetricsSnapshot( - deps.cloudWatchClient, - deps.logGroupName, + await collectSnapshots( + deps, + deliveryLogGroupNames, finalStartSec, finalEndSec, + lastCbSnapshotSec, + out, ); - if (finalSnap !== null) snapshots.push(finalSnap); - if (deliveryLogGroupNames.length > 0) { - const finalDeliverySnap = await queryDeliveryMetricsSnapshot( + const perClientRateTimelines: PerClientRateTimeline[] = []; + + if (deps.deliveryLogGroupPrefix) { + const clientIds = [...new Set(scenario.eventMix.map((e) => e.clientId))]; + const timelinePromises = clientIds.map(async (clientId) => { + const logGroupName = `${deps.deliveryLogGroupPrefix}${clientId}`; + const entries = await queryPerClientRateTimeline( + deps.cloudWatchClient, + logGroupName, + finalStartSec, + finalEndSec, + ); + return { clientId, entries }; + }); + const timelines = await Promise.all(timelinePromises); + perClientRateTimelines.push( + ...timelines.filter((t) => t.entries.length > 0), + ); + } + + let webhookVerification: WebhookVerificationResult | undefined; + if 
(deps.mockWebhookLogGroup) { + webhookVerification = await verifyMockWebhook( deps.cloudWatchClient, - deliveryLogGroupNames, + deps.mockWebhookLogGroup, finalStartSec, finalEndSec, ); - if (finalDeliverySnap !== null) deliverySnapshots.push(finalDeliverySnap); + } + + await purgeQueues(deps.sqsClient, queueUrls); + if (elastiCacheDeps) { + await flushElastiCache(elastiCacheDeps); } return { @@ -136,5 +225,8 @@ export async function runPerformanceTest( phases: phaseResults, metrics: snapshots, deliveryMetrics: deliverySnapshots, + circuitBreakerMetrics: cbSnapshots, + perClientRateTimelines, + webhookVerification, }; } diff --git a/lambdas/perf-runner-lambda/src/types.ts b/lambdas/perf-runner-lambda/src/types.ts index 5366602d..b4ece4ae 100644 --- a/lambdas/perf-runner-lambda/src/types.ts +++ b/lambdas/perf-runner-lambda/src/types.ts @@ -55,6 +55,35 @@ export type DeliveryMetricsSnapshot = { p99Ms: number; }; +export type CircuitBreakerSnapshot = { + snapshotAt: number; + intervalStartSec: number; + intervalEndSec: number; + circuitOpenEvents: number; + circuitCloseEvents: number; + admissionDeniedCircuitOpen: number; + admissionDeniedRateLimited: number; + deliveryAttempts: number; + deliverySuccesses: number; + deliveryFailures: number; + deliveryRateLimited: number; +}; + +export type PerClientRateEntry = { + timestampSec: number; + deliveryAttempts: number; +}; + +export type PerClientRateTimeline = { + clientId: string; + entries: PerClientRateEntry[]; +}; + +export type WebhookVerificationResult = { + receivedCallbacks: number; + verified: boolean; +}; + export type PerformanceResult = { testId: string; scenario: Scenario; @@ -63,6 +92,9 @@ export type PerformanceResult = { phases: PhaseResult[]; metrics: MetricsSnapshot[]; deliveryMetrics: DeliveryMetricsSnapshot[]; + circuitBreakerMetrics: CircuitBreakerSnapshot[]; + perClientRateTimelines?: PerClientRateTimeline[]; + webhookVerification?: WebhookVerificationResult; }; export type PerfRunnerPayload = { @@ 
-76,4 +108,13 @@ export type RunnerDeps = { queueUrl: string; logGroupName: string; deliveryLogGroupPrefix?: string; + deliveryQueueUrlPrefix?: string; + mockWebhookLogGroup?: string; +}; + +export type ElastiCacheDeps = { + endpoint: string; + cacheName: string; + iamUsername: string; + region: string; }; diff --git a/lambdas/perf-runner-lambda/src/webhook-verify.ts b/lambdas/perf-runner-lambda/src/webhook-verify.ts new file mode 100644 index 00000000..77c1fa6d --- /dev/null +++ b/lambdas/perf-runner-lambda/src/webhook-verify.ts @@ -0,0 +1,59 @@ +import { + type CloudWatchLogsClient, + GetQueryResultsCommand, + StartQueryCommand, +} from "@aws-sdk/client-cloudwatch-logs"; +import type { WebhookVerificationResult } from "types"; + +const INSIGHTS_POLL_INTERVAL_MS = 2000; +const INSIGHTS_TIMEOUT_MS = 30_000; + +export async function verifyMockWebhook( + client: CloudWatchLogsClient, + logGroupName: string, + startTimeSec: number, + endTimeSec: number, +): Promise { + const { queryId } = await client.send( + new StartQueryCommand({ + logGroupName, + startTime: startTimeSec, + endTime: endTimeSec, + queryString: [ + 'filter msg = "Callback received"', + "| stats count(*) as callbackCount", + ].join("\n"), + }), + ); + + if (!queryId) { + return { receivedCallbacks: 0, verified: false }; + } + + const deadline = Date.now() + INSIGHTS_TIMEOUT_MS; + + while (Date.now() < deadline) { + await new Promise((resolve) => { + setTimeout(resolve, INSIGHTS_POLL_INTERVAL_MS); + }); + + const response = await client.send(new GetQueryResultsCommand({ queryId })); + + if (response.status === "Failed" || response.status === "Cancelled") { + return { receivedCallbacks: 0, verified: false }; + } + + if (response.status === "Complete") { + const rows = + (response.results as { field?: string; value?: string }[][]) ?? []; + const row = rows[0] ?? []; + const count = Number( + row.find((f) => f.field === "callbackCount")?.value ?? 
0, + ); + + return { receivedCallbacks: count, verified: count > 0 }; + } + } + + return { receivedCallbacks: 0, verified: false }; +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c497eafb..f2b2aa3a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -450,21 +450,30 @@ importers: lambdas/perf-runner-lambda: dependencies: + '@aws-crypto/sha256-js': + specifier: catalog:aws + version: 5.2.0 '@aws-sdk/client-cloudwatch-logs': specifier: catalog:aws version: 3.1026.0 '@aws-sdk/client-sqs': specifier: catalog:aws version: 3.1026.0 + '@aws-sdk/credential-providers': + specifier: catalog:aws + version: 3.1026.0 '@nhs-notify-client-callbacks/logger': specifier: workspace:* version: link:../../src/logger '@nhs-notify-client-callbacks/models': specifier: workspace:* version: link:../../src/models - esbuild: - specifier: catalog:tools - version: 0.28.0 + '@redis/client': + specifier: catalog:app + version: 1.6.1 + '@smithy/signature-v4': + specifier: catalog:aws + version: 5.3.13 devDependencies: '@tsconfig/node22': specifier: catalog:tools @@ -478,6 +487,9 @@ importers: '@types/node': specifier: catalog:tools version: 25.6.0 + esbuild: + specifier: catalog:tools + version: 0.28.0 eslint: specifier: catalog:lint version: 9.39.4(jiti@2.6.1) diff --git a/scripts/tests/integration-debug.sh b/scripts/tests/integration-debug.sh index a4ebbd63..f375be1a 100755 --- a/scripts/tests/integration-debug.sh +++ b/scripts/tests/integration-debug.sh @@ -59,6 +59,7 @@ fi ACCOUNT_ID="$(aws sts get-caller-identity --profile "$AWS_PROFILE" --query Account --output text)" PREFIX="nhs-${ENVIRONMENT}-callbacks" +CLIENT_PREFIX="nhs-${ENVIRONMENT}-cbc" PIPE_NAME="${PREFIX}-main" print_section() { @@ -97,8 +98,8 @@ show_queue_counts() { action_queue_status() { require_client_id - show_queue_counts "Client Delivery Queue - Message Counts" "${PREFIX}-${CLIENT_ID}-delivery-queue" - show_queue_counts "Client Delivery DLQ - Message Counts" "${PREFIX}-${CLIENT_ID}-delivery-dlq-queue" + 
show_queue_counts "Client Delivery Queue - Message Counts" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-queue" + show_queue_counts "Client Delivery DLQ - Message Counts" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-dlq-queue" show_queue_counts "Inbound Event Queue - Message Counts" "${PREFIX}-inbound-event-queue" show_queue_counts "Inbound Event DLQ - Message Counts" "${PREFIX}-inbound-event-dlq" } @@ -126,8 +127,8 @@ peek_queue_message() { action_queue_peek() { require_client_id - peek_queue_message "Client Delivery Queue - Message Peek" "${PREFIX}-${CLIENT_ID}-delivery-queue" - peek_queue_message "Client Delivery DLQ - Message Peek" "${PREFIX}-${CLIENT_ID}-delivery-dlq-queue" + peek_queue_message "Client Delivery Queue - Message Peek" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-queue" + peek_queue_message "Client Delivery DLQ - Message Peek" "${CLIENT_PREFIX}-${CLIENT_ID}-delivery-dlq-queue" peek_queue_message "Inbound Event Queue - Message Peek" "${PREFIX}-inbound-event-queue" peek_queue_message "Inbound Event DLQ - Message Peek" "${PREFIX}-inbound-event-dlq" } @@ -162,7 +163,7 @@ action_tail_https_client() { print_section "HTTPS Client Lambda Logs" aws logs tail \ - "/aws/lambda/${PREFIX}-https-client-${CLIENT_ID}" \ + "/aws/lambda/${CLIENT_PREFIX}-https-client-${CLIENT_ID}" \ --region "$REGION" \ --profile "$AWS_PROFILE" \ --since 30m \ diff --git a/tests/integration/delivery-resilience.test.ts b/tests/integration/delivery-resilience.test.ts index 8b218233..8fa510e5 100644 --- a/tests/integration/delivery-resilience.test.ts +++ b/tests/integration/delivery-resilience.test.ts @@ -117,7 +117,7 @@ describe("Delivery Resilience", () => { const { clientId } = getClientConfig("clientRateLimit"); dlqUrl = ctx.clientDlqUrl(clientId); deliveryUrl = ctx.clientDeliveryUrl(clientId); - httpsClientLogGroup = ctx.logGroup(`https-client-${clientId}`); + httpsClientLogGroup = ctx.clientLogGroup(`https-client-${clientId}`); await purgeQueues(ctx.sqs, [dlqUrl, deliveryUrl]); }); @@ -190,7 
+190,7 @@ describe("Delivery Resilience", () => { const { clientId } = getClientConfig("clientCircuitBreaker"); dlqUrl = ctx.clientDlqUrl(clientId); deliveryUrl = ctx.clientDeliveryUrl(clientId); - httpsClientLogGroup = ctx.logGroup(`https-client-${clientId}`); + httpsClientLogGroup = ctx.clientLogGroup(`https-client-${clientId}`); await purgeQueues(ctx.sqs, [dlqUrl, deliveryUrl]); }); diff --git a/tests/integration/dlq-alarms.test.ts b/tests/integration/dlq-alarms.test.ts index c4f69fa8..ae1e1bff 100644 --- a/tests/integration/dlq-alarms.test.ts +++ b/tests/integration/dlq-alarms.test.ts @@ -13,10 +13,10 @@ import { import { buildMockClientDlqQueueUrl } from "./helpers/sqs"; function buildDlqDepthAlarmName( - { component, environment, project }: DeploymentDetails, + { clientComponent, environment, project }: DeploymentDetails, clientId: string, ): string { - return `${project}-${environment}-${component}-${clientId}-dlq-depth`; + return `${project}-${environment}-${clientComponent}-${clientId}-dlq-depth`; } function getQueueNameFromUrl(queueUrl: string): string { diff --git a/tests/integration/helpers/sqs.ts b/tests/integration/helpers/sqs.ts index 5cdcc3a9..2f35b7e5 100644 --- a/tests/integration/helpers/sqs.ts +++ b/tests/integration/helpers/sqs.ts @@ -49,14 +49,20 @@ export function buildMockClientDlqQueueUrl( deploymentDetails: DeploymentDetails, clientId: string, ): string { - return buildQueueUrl(deploymentDetails, `${clientId}-delivery-dlq`); + return buildQueueUrl( + { ...deploymentDetails, component: deploymentDetails.clientComponent }, + `${clientId}-delivery-dlq`, + ); } export function buildMockClientDeliveryQueueUrl( deploymentDetails: DeploymentDetails, clientId: string, ): string { - return buildQueueUrl(deploymentDetails, `${clientId}-delivery`); + return buildQueueUrl( + { ...deploymentDetails, component: deploymentDetails.clientComponent }, + `${clientId}-delivery`, + ); } export async function sendSqsEvent( diff --git 
a/tests/integration/helpers/test-context.ts b/tests/integration/helpers/test-context.ts index df5a31f5..c55ebdf6 100644 --- a/tests/integration/helpers/test-context.ts +++ b/tests/integration/helpers/test-context.ts @@ -25,10 +25,15 @@ export type TestContext = { clientDlqUrl(clientId: string): string; clientDeliveryUrl(clientId: string): string; logGroup(name: string): string; + clientLogGroup(name: string): string; }; export function createTestContext(): TestContext { const deployment = getDeploymentDetails(); + const clientDeployment = { + ...deployment, + component: deployment.clientComponent, + }; return { sqs: createSqsClient(deployment), @@ -43,6 +48,7 @@ export function createTestContext(): TestContext { clientDeliveryUrl: (clientId) => buildMockClientDeliveryQueueUrl(deployment, clientId), logGroup: (name) => buildLambdaLogGroupName(deployment, name), + clientLogGroup: (name) => buildLambdaLogGroupName(clientDeployment, name), }; } diff --git a/tests/integration/metrics.test.ts b/tests/integration/metrics.test.ts index 20e1dfb8..cd99588b 100644 --- a/tests/integration/metrics.test.ts +++ b/tests/integration/metrics.test.ts @@ -118,7 +118,7 @@ describe("Metrics", () => { beforeAll(() => { const { clientId } = getClientConfig("clientSingleTarget"); - httpsClientLogGroup = ctx.logGroup(`https-client-${clientId}`); + httpsClientLogGroup = ctx.clientLogGroup(`https-client-${clientId}`); }); it("should emit DeliveryAttempt, DeliverySuccess and DeliveryDurationMs on successful delivery", async () => { diff --git a/tests/test-support/helpers/deployment.ts b/tests/test-support/helpers/deployment.ts index 20bf1f59..9a548f70 100644 --- a/tests/test-support/helpers/deployment.ts +++ b/tests/test-support/helpers/deployment.ts @@ -3,6 +3,7 @@ export type DeploymentDetails = { environment: string; project: string; component: string; + clientComponent: string; accountId: string; }; @@ -11,6 +12,7 @@ export function getDeploymentDetails(): DeploymentDetails { const 
environment = process.env.ENVIRONMENT; const project = process.env.PROJECT ?? "nhs"; const component = process.env.COMPONENT ?? "callbacks"; + const clientComponent = process.env.CLIENT_COMPONENT ?? "cbc"; const accountId = process.env.AWS_ACCOUNT_ID; if (!environment) { @@ -21,7 +23,14 @@ export function getDeploymentDetails(): DeploymentDetails { throw new Error("AWS_ACCOUNT_ID environment variable must be set"); } - return { region, environment, project, component, accountId }; + return { + region, + environment, + project, + component, + clientComponent, + accountId, + }; } export function buildSubscriptionConfigBucketName({