diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml index 014f051..2f2032b 100644 --- a/.github/workflows/validate.yml +++ b/.github/workflows/validate.yml @@ -33,6 +33,9 @@ jobs: - name: Terraform validate run: terraform -chdir=grantstack-backend/terraform validate + - name: Terraform tests + run: terraform -chdir=grantstack-backend/terraform test + - name: Set up Python uses: actions/setup-python@v5 with: diff --git a/README.md b/README.md index 97b4f5f..9e2379d 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,7 @@ This repo is designed to show practical full-stack cloud engineering judgment: - `grantstack-backend/lambda/` - Python Lambda handlers. - `grantstack-backend/scripts/smoke_test.py` - public API workflow smoke test. - `grantstack-backend/docs/DLQ_REPLAY_RUNBOOK.md` - dead-letter queue recovery runbook. +- `grantstack-backend/docs/SQS_WORKER_RUNBOOK.md` - intentional dev worker operation and empty-receive monitoring. - `grantstack-landing/` - Cloudflare Pages static site. ## Backend Deploy @@ -228,6 +229,7 @@ python3 -m py_compile grantstack-backend/scripts/sync_vector_index.py python3 -m unittest discover -s grantstack-backend/tests terraform -chdir=grantstack-backend/terraform fmt -check -recursive terraform -chdir=grantstack-backend/terraform validate +terraform -chdir=grantstack-backend/terraform test grantstack-backend/scripts/sync_vector_index.py --dry-run --limit 3 grantstack-backend/scripts/smoke_test.py --timeout 180 --interval 5 ``` @@ -309,7 +311,7 @@ The infrastructure is designed for near-zero idle cost: - API Gateway HTTP API charges only by request. - API Gateway stage throttling reduces accidental or abusive request bursts. - Lambda charges only during execution. -- SQS charges by request. +- SQS charges by request. The dev Lambda poller is disabled by default; set `enable_sqs_worker = true` and apply only when dev queue processing is intentional. - DynamoDB uses `PAY_PER_REQUEST`. - DynamoDB TTL is enabled on `expires_at` so pilot project and analytics data can age out automatically. - CloudWatch Logs, dashboards, alarms, X-Ray traces, S3 catalog storage, and remote Terraform state may create small charges as usage grows. diff --git a/grantstack-backend/ARCHITECTURE.md b/grantstack-backend/ARCHITECTURE.md index d72fd85..74c4b16 100644 --- a/grantstack-backend/ARCHITECTURE.md +++ b/grantstack-backend/ARCHITECTURE.md @@ -14,6 +14,8 @@ GrantStack uses a fully asynchronous, decoupled serverless workflow designed for 8. A scheduled Source Refresh Lambda verifies official source URLs, records retrieval status and content hashes, and writes the active catalog to S3. 9. The frontend posts privacy-light product events to `POST /analytics`; the Analytics Lambda validates and stores them in a separate on-demand DynamoDB table with TTL. +The SQS event source mapping is disabled by default in `dev` and requires `enable_sqs_worker = true` for intentional processing. Staging and production remain enabled by default. Messages sent while the dev mapping is disabled remain queued until processing is enabled. + ```mermaid flowchart LR Client["Client"] --> Api["API Gateway HTTP API"] @@ -37,7 +39,7 @@ flowchart LR - API Gateway HTTP API has no provisioned gateway capacity. - Lambda uses no provisioned concurrency and is not attached to a VPC, avoiding NAT gateway idle cost. -- SQS charges per request and stores messages only while work exists. +- SQS charges per request. Dev disables the Lambda event source mapping by default so managed pollers do not issue empty receives while idle. - DynamoDB uses `PAY_PER_REQUEST`; there is no provisioned read/write capacity. - The analytics table also uses `PAY_PER_REQUEST` and TTL, preserving the same zero-idle compute posture for product telemetry. - CloudWatch log groups use explicit retention. diff --git a/grantstack-backend/docs/DLQ_REPLAY_RUNBOOK.md b/grantstack-backend/docs/DLQ_REPLAY_RUNBOOK.md index b07066b..8e397f5 100644 --- a/grantstack-backend/docs/DLQ_REPLAY_RUNBOOK.md +++ b/grantstack-backend/docs/DLQ_REPLAY_RUNBOOK.md @@ -23,6 +23,7 @@ aws sqs receive-message \ --attribute-names All \ --message-attribute-names All \ --max-number-of-messages 1 \ + --wait-time-seconds 20 \ --visibility-timeout 30 ``` diff --git a/grantstack-backend/docs/SQS_WORKER_RUNBOOK.md b/grantstack-backend/docs/SQS_WORKER_RUNBOOK.md new file mode 100644 index 0000000..a462077 --- /dev/null +++ b/grantstack-backend/docs/SQS_WORKER_RUNBOOK.md @@ -0,0 +1,50 @@ +# SQS Worker Runbook + +GrantStack consumes `grantstack--processing` through an AWS Lambda event source mapping. There is no application-side polling loop. Lambda owns the `ReceiveMessage` calls, long polling, scaling, and idle backoff behavior. + +The processing queue uses a 20-second receive wait. The 960-second visibility timeout exceeds the processor Lambda's 900-second timeout. The mapping keeps a batch size of one because a single report can use most of that processing window; increasing the batch would risk messages becoming visible before the batch completes. Lambda deletes successfully processed messages and the handler reports individual failures through `ReportBatchItemFailures`. + +## Dev Worker Lifecycle + +The event source mapping is disabled by default when `environment = "dev"`. This stops idle SQS receives without deleting the queue, DLQ, or pending messages. Staging and production remain enabled by default. + +`enable_sqs_worker` is a deployment-time control because the Lambda service poller exists outside the function process. Automation can pass it as `TF_VAR_enable_sqs_worker`; a checked-in or CLI variable value takes normal Terraform precedence. + +Enable dev processing intentionally: + +```sh +terraform -chdir=grantstack-backend/terraform plan \ + -var-file=env/dev.tfvars \ + -var='enable_sqs_worker=true' \ + -out=grantstack-dev-worker.tfplan +terraform -chdir=grantstack-backend/terraform apply grantstack-dev-worker.tfplan +``` + +After the dev work is complete, set `enable_sqs_worker = false` in the dev variable file and apply again. `terraform output -raw sqs_worker_enabled` reports the deployed intent. Processor cold invocations log the queue name, 20-second wait, Lambda-managed idle behavior, and received batch size. + +Lambda event source mappings do not expose a configurable exponential idle-backoff setting. AWS manages their pollers and can keep multiple long polls active even when a queue is empty. Disabling the dev mapping is therefore the reliable zero-idle-request control; adding sleep or jitter inside the Lambda handler would not affect the separate AWS-managed pollers. + +## Verify Empty Receives + +In CloudWatch Metrics, select `AWS/SQS`, `Queue Metrics`, `NumberOfEmptyReceives`, and the `QueueName` dimension. Use a one-day period, the `Sum` statistic, and the deployment date through the current date. The operations dashboard also includes this metric at one-minute resolution. + +CLI example for UTC dates: + +```sh +aws cloudwatch get-metric-statistics \ + --namespace AWS/SQS \ + --metric-name NumberOfEmptyReceives \ + --dimensions Name=QueueName,Value=grantstack-dev-processing \ + --statistics Sum \ + --period 86400 \ + --start-time 2026-06-01T00:00:00Z \ + --end-time 2026-07-01T00:00:00Z +``` + +After disabling the mapping, confirm that daily sums fall to zero after any in-flight polls finish. Also verify the mapping state: + +```sh +aws lambda list-event-source-mappings \ + --function-name grantstack-dev-processor \ + --event-source-arn "$(terraform -chdir=grantstack-backend/terraform output -raw processing_queue_arn)" +``` diff --git a/grantstack-backend/lambda/processor_handler.py b/grantstack-backend/lambda/processor_handler.py index f073022..dccc210 100644 --- a/grantstack-backend/lambda/processor_handler.py +++ b/grantstack-backend/lambda/processor_handler.py @@ -59,8 +59,17 @@ def lambda_handler(event: Mapping[str, Any], context: Any) -> Dict[str, List[Dic config = load_config() table = dynamodb.Table(config.table_name) failed_items: List[Dict[str, str]] = [] + records = event.get("Records", []) + + logger.info( + "sqs_worker_started queue=%s wait_time_seconds=%s idle_backoff=%s batch_size=%s", + os.environ.get("SQS_QUEUE_NAME", "unknown"), + os.environ.get("SQS_RECEIVE_WAIT_TIME_SECS", "20"), + os.environ.get("SQS_IDLE_BACKOFF_MODE", "aws-lambda-managed"), + len(records), + ) - for record in event.get("Records", []): + for record in records: message_id = record.get("messageId", "unknown-message") try: asyncio.run(process_record(record, table, config)) diff --git a/grantstack-backend/terraform/env/dev.tfvars.example b/grantstack-backend/terraform/env/dev.tfvars.example index 4d48ba0..e9f3058 100644 --- a/grantstack-backend/terraform/env/dev.tfvars.example +++ b/grantstack-backend/terraform/env/dev.tfvars.example @@ -38,6 +38,7 @@ enable_xray_tracing = true enable_api_access_logs = true enable_cloudwatch_dashboard = true enable_source_refresh = true +enable_sqs_worker = false processor_memory_size = 1024 processor_timeout_seconds = 900 diff --git a/grantstack-backend/terraform/env/prod.tfvars.example b/grantstack-backend/terraform/env/prod.tfvars.example index 2786997..57962b0 100644 --- a/grantstack-backend/terraform/env/prod.tfvars.example +++ b/grantstack-backend/terraform/env/prod.tfvars.example @@ -40,6 +40,7 @@ enable_xray_tracing = true enable_api_access_logs = true enable_cloudwatch_dashboard = true enable_source_refresh = true +enable_sqs_worker = true processor_memory_size = 1024 processor_timeout_seconds = 900 diff --git a/grantstack-backend/terraform/env/staging.tfvars.example b/grantstack-backend/terraform/env/staging.tfvars.example index 267ba5a..e4b22ec 100644 --- a/grantstack-backend/terraform/env/staging.tfvars.example +++ b/grantstack-backend/terraform/env/staging.tfvars.example @@ -40,6 +40,7 @@ enable_xray_tracing = true enable_api_access_logs = true enable_cloudwatch_dashboard = true enable_source_refresh = true +enable_sqs_worker = true processor_memory_size = 1024 processor_timeout_seconds = 900 diff --git a/grantstack-backend/terraform/main.tf b/grantstack-backend/terraform/main.tf index e8b9062..c17c843 100644 --- a/grantstack-backend/terraform/main.tf +++ b/grantstack-backend/terraform/main.tf @@ -22,10 +22,12 @@ provider "aws" { data "aws_caller_identity" "current" {} locals { - name_prefix = "${var.project_name}-${var.environment}" - dynamodb_table_name = coalesce(var.dynamodb_table_name, "${local.name_prefix}-projects") - analytics_table_name = coalesce(var.analytics_table_name, "${local.name_prefix}-analytics") - source_catalog_bucket_name = coalesce(var.source_catalog_bucket_name, "${local.name_prefix}-source-catalog-${data.aws_caller_identity.current.account_id}") + name_prefix = "${var.project_name}-${var.environment}" + dynamodb_table_name = coalesce(var.dynamodb_table_name, "${local.name_prefix}-projects") + analytics_table_name = coalesce(var.analytics_table_name, "${local.name_prefix}-analytics") + source_catalog_bucket_name = coalesce(var.source_catalog_bucket_name, "${local.name_prefix}-source-catalog-${data.aws_caller_identity.current.account_id}") + sqs_receive_wait_time_seconds = 20 + sqs_worker_enabled = var.enable_sqs_worker != null ? var.enable_sqs_worker : var.environment != "dev" external_secret_arns = compact([ var.vector_db_api_key_secret_arn, var.embedding_api_key_secret_arn, @@ -111,6 +113,7 @@ data "archive_file" "source_refresh_lambda" { resource "aws_sqs_queue" "processing_dlq" { name = "${local.name_prefix}-processing-dlq" message_retention_seconds = 1209600 + receive_wait_time_seconds = local.sqs_receive_wait_time_seconds sqs_managed_sse_enabled = true tags = local.common_tags @@ -120,7 +123,7 @@ resource "aws_sqs_queue" "processing" { name = "${local.name_prefix}-processing" visibility_timeout_seconds = var.sqs_visibility_timeout_seconds message_retention_seconds = var.sqs_message_retention_seconds - receive_wait_time_seconds = 20 + receive_wait_time_seconds = local.sqs_receive_wait_time_seconds sqs_managed_sse_enabled = true redrive_policy = jsonencode({ @@ -701,6 +704,9 @@ resource "aws_lambda_function" "processor" { LLM_MODEL = var.llm_model MOCK_EXTERNAL_CALLS = tostring(var.mock_external_calls) HTTP_CLIENT_TIMEOUT_SECS = tostring(var.http_client_timeout_seconds) + SQS_QUEUE_NAME = aws_sqs_queue.processing.name + SQS_RECEIVE_WAIT_TIME_SECS = tostring(local.sqs_receive_wait_time_seconds) + SQS_IDLE_BACKOFF_MODE = "aws-lambda-managed" } } @@ -814,12 +820,13 @@ resource "aws_lambda_function" "source_refresh" { } resource "aws_lambda_event_source_mapping" "processor_sqs" { - event_source_arn = aws_sqs_queue.processing.arn - function_name = aws_lambda_function.processor.arn + event_source_arn = aws_sqs_queue.processing.arn + function_name = aws_lambda_function.processor.arn + # One report can use the full processor timeout, so larger batches risk visibility expiry. batch_size = 1 maximum_batching_window_in_seconds = 0 function_response_types = ["ReportBatchItemFailures"] - enabled = true + enabled = local.sqs_worker_enabled } resource "aws_cloudwatch_event_rule" "source_refresh" { @@ -1262,6 +1269,7 @@ resource "aws_cloudwatch_dashboard" "grantstack" { metrics = [ ["AWS/SQS", "ApproximateNumberOfMessagesVisible", "QueueName", aws_sqs_queue.processing.name, { stat = "Maximum", label = "Processing visible" }], [".", "ApproximateAgeOfOldestMessage", ".", aws_sqs_queue.processing.name, { stat = "Maximum", label = "Oldest age" }], + [".", "NumberOfEmptyReceives", ".", aws_sqs_queue.processing.name, { stat = "Sum", label = "Empty receives" }], [".", "ApproximateNumberOfMessagesVisible", ".", aws_sqs_queue.processing_dlq.name, { stat = "Maximum", label = "DLQ visible" }] ] period = 60 @@ -1367,6 +1375,11 @@ output "processing_queue_arn" { value = aws_sqs_queue.processing.arn } +output "sqs_worker_enabled" { + description = "Whether the Lambda SQS event source mapping is actively polling the processing queue." + value = local.sqs_worker_enabled +} + output "processing_dlq_url" { description = "SQS processing DLQ URL." value = aws_sqs_queue.processing_dlq.url diff --git a/grantstack-backend/terraform/terraform.tfvars.example b/grantstack-backend/terraform/terraform.tfvars.example index 8d368d7..158e6ee 100644 --- a/grantstack-backend/terraform/terraform.tfvars.example +++ b/grantstack-backend/terraform/terraform.tfvars.example @@ -45,6 +45,7 @@ enable_xray_tracing = true enable_api_access_logs = true enable_cloudwatch_dashboard = true enable_source_refresh = true +enable_sqs_worker = false processor_memory_size = 1024 processor_timeout_seconds = 900 diff --git a/grantstack-backend/terraform/tests/sqs_worker.tftest.hcl b/grantstack-backend/terraform/tests/sqs_worker.tftest.hcl new file mode 100644 index 0000000..bec0ed2 --- /dev/null +++ b/grantstack-backend/terraform/tests/sqs_worker.tftest.hcl @@ -0,0 +1,64 @@ +mock_provider "aws" { + mock_data "aws_caller_identity" { + defaults = { + account_id = "123456789012" + } + } + + mock_data "aws_iam_policy_document" { + defaults = { + json = "{\"Version\":\"2012-10-17\",\"Statement\":[]}" + } + } +} +mock_provider "archive" {} + +run "dev_worker_is_disabled_by_default" { + command = plan + + variables { + environment = "dev" + } + + assert { + condition = aws_lambda_event_source_mapping.processor_sqs.enabled == false + error_message = "The dev SQS worker must not poll unless it is explicitly enabled." + } + + assert { + condition = aws_sqs_queue.processing.receive_wait_time_seconds == 20 + error_message = "The processing queue must use 20-second long polling." + } + + assert { + condition = aws_sqs_queue.processing_dlq.receive_wait_time_seconds == 20 + error_message = "The processing DLQ must use 20-second long polling." + } +} + +run "dev_worker_can_be_enabled_explicitly" { + command = plan + + variables { + environment = "dev" + enable_sqs_worker = true + } + + assert { + condition = aws_lambda_event_source_mapping.processor_sqs.enabled == true + error_message = "The dev SQS worker should run when explicitly enabled." + } +} + +run "production_worker_remains_enabled_by_default" { + command = plan + + variables { + environment = "prod" + } + + assert { + condition = aws_lambda_event_source_mapping.processor_sqs.enabled == true + error_message = "The production SQS worker must remain enabled by default." + } +} diff --git a/grantstack-backend/terraform/variables.tf b/grantstack-backend/terraform/variables.tf index a76563e..cec11d3 100644 --- a/grantstack-backend/terraform/variables.tf +++ b/grantstack-backend/terraform/variables.tf @@ -71,6 +71,13 @@ variable "sqs_visibility_timeout_seconds" { default = 960 } +variable "enable_sqs_worker" { + description = "Enable the Lambda SQS processor poller. Defaults to false in dev and true in other environments." + type = bool + default = null + nullable = true +} + variable "sqs_message_retention_seconds" { description = "Processing queue message retention in seconds." type = number