From 5b5f342918cc2db01de6aeec0aaf8e73acdf3b96 Mon Sep 17 00:00:00 2001 From: Everett Smith Date: Thu, 24 Jul 2025 10:38:39 -0700 Subject: [PATCH 1/6] Migrate from aws_launch_configuration to aws_launch_template --- modules/aws_ecs/ecs.tf | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/modules/aws_ecs/ecs.tf b/modules/aws_ecs/ecs.tf index 4e16958..5efb3ec 100644 --- a/modules/aws_ecs/ecs.tf +++ b/modules/aws_ecs/ecs.tf @@ -37,28 +37,29 @@ data "aws_ami" "this" { ] } -resource "aws_launch_configuration" "this" { +resource "aws_launch_template" "this" { count = var.launch_type == "EC2" ? 1 : 0 - name_prefix = "${var.deployment_name}-ecs-launch-configuration-" + name_prefix = "${var.deployment_name}-ecs-launch-template-" image_id = data.aws_ami.this.id instance_type = var.instance_type # e.g. t2.medium - enable_monitoring = true - associate_public_ip_address = true + monitoring { + enabled = true + } + + network_interfaces { + associate_public_ip_address = false + security_groups = [aws_security_group.containers.id] + } # This user data represents a collection of “scripts” that will be executed the first time the machine starts. # This specific example makes sure the EC2 instance is automatically attached to the ECS cluster that we create earlier # and marks the instance as purchased through the Spot pricing - user_data = <<-EOF - #!/bin/bash - echo ECS_CLUSTER=${var.deployment_name}-ecs >> /etc/ecs/ecs.config - EOF - - # We’ll see security groups later - security_groups = [ - aws_security_group.containers.id - ] - + user_data = base64encode(<<-EOF + #!/bin/bash + echo ECS_CLUSTER=${var.deployment_name}-ecs >> /etc/ecs/ecs.config + EOF + ) # If you want to SSH into the instance and manage it directly: # 1. Make sure this key exists in the AWS EC2 dashboard # 2. 
Make sure your local SSH agent has it loaded @@ -80,10 +81,15 @@ resource "aws_autoscaling_group" "this" { min_size = var.min_instance_count desired_capacity = var.min_instance_count vpc_zone_identifier = var.private_subnet_ids - launch_configuration = aws_launch_configuration.this[0].name + + launch_template { + id = aws_launch_template.this[0].id + version = "$Latest" + } default_cooldown = 30 health_check_grace_period = 30 + health_check_type = "EC2" termination_policies = [ "OldestInstance" From ee7444e2bacee243143b976d18c137a09c72cf8f Mon Sep 17 00:00:00 2001 From: Everett Smith Date: Thu, 24 Jul 2025 10:52:59 -0700 Subject: [PATCH 2/6] Migrate EC2 launch type to awsvpc network mode --- modules/aws_ecs/loadbalancers.tf | 2 +- modules/aws_ecs/main.tf | 109 ++++++++++--------------------- 2 files changed, 37 insertions(+), 74 deletions(-) diff --git a/modules/aws_ecs/loadbalancers.tf b/modules/aws_ecs/loadbalancers.tf index 1d0d2ca..3f3e336 100644 --- a/modules/aws_ecs/loadbalancers.tf +++ b/modules/aws_ecs/loadbalancers.tf @@ -58,7 +58,7 @@ resource "aws_lb_target_group" "this" { deregistration_delay = 30 port = 3000 protocol = "HTTP" - target_type = var.launch_type == "FARGATE" ? "ip" : "instance" + target_type = "ip" health_check { interval = 61 diff --git a/modules/aws_ecs/main.tf b/modules/aws_ecs/main.tf index b89b2a7..d030128 100644 --- a/modules/aws_ecs/main.tf +++ b/modules/aws_ecs/main.tf @@ -54,7 +54,6 @@ resource "aws_ecs_service" "retool" { desired_count = var.min_instance_count - 1 deployment_maximum_percent = var.maximum_percent deployment_minimum_healthy_percent = var.minimum_healthy_percent - iam_role = var.launch_type == "EC2" ? aws_iam_role.service_role.arn : null propagate_tags = var.task_propagate_tags enable_execute_command = var.enable_execute_command @@ -71,16 +70,10 @@ resource "aws_ecs_service" "retool" { capacity_provider = var.launch_type == "FARGATE" ? 
"FARGATE" : aws_ecs_capacity_provider.this[0].name } - dynamic "network_configuration" { - for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([]) - - content { - subnets = var.private_subnet_ids - security_groups = [ - aws_security_group.containers.id - ] - assign_public_ip = true - } + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.containers.id] + assign_public_ip = false } } @@ -99,16 +92,10 @@ resource "aws_ecs_service" "jobs_runner" { capacity_provider = var.launch_type == "FARGATE" ? "FARGATE" : aws_ecs_capacity_provider.this[0].name } - dynamic "network_configuration" { - for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([]) - - content { - subnets = var.private_subnet_ids - security_groups = [ - aws_security_group.containers.id - ] - assign_public_ip = true - } + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.containers.id] + assign_public_ip = false } } @@ -132,16 +119,10 @@ resource "aws_ecs_service" "workflows_backend" { registry_arn = aws_service_discovery_service.retool_workflow_backend_service[0].arn } - dynamic "network_configuration" { - for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([]) - - content { - subnets = var.private_subnet_ids - security_groups = [ - aws_security_group.containers.id - ] - assign_public_ip = true - } + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.containers.id] + assign_public_ip = false } } @@ -161,16 +142,10 @@ resource "aws_ecs_service" "workflows_worker" { capacity_provider = var.launch_type == "FARGATE" ? "FARGATE" : aws_ecs_capacity_provider.this[0].name } - dynamic "network_configuration" { - for_each = var.launch_type == "FARGATE" ? 
toset([1]) : toset([]) - - content { - subnets = var.private_subnet_ids - security_groups = [ - aws_security_group.containers.id - ] - assign_public_ip = true - } + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.containers.id] + assign_public_ip = false } } @@ -193,16 +168,10 @@ resource "aws_ecs_service" "code_executor" { registry_arn = aws_service_discovery_service.retool_code_executor_service[0].arn } - dynamic "network_configuration" { - for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([]) - - content { - subnets = var.private_subnet_ids - security_groups = [ - aws_security_group.containers.id - ] - assign_public_ip = true - } + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.containers.id] + assign_public_ip = false } } @@ -226,16 +195,10 @@ resource "aws_ecs_service" "telemetry" { registry_arn = aws_service_discovery_service.retool_telemetry_service[0].arn } - dynamic "network_configuration" { - for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([]) - - content { - subnets = var.private_subnet_ids - security_groups = [ - aws_security_group.containers.id - ] - assign_public_ip = true - } + network_configuration { + subnets = var.private_subnet_ids + security_groups = [aws_security_group.containers.id] + assign_public_ip = false } } @@ -243,8 +206,8 @@ resource "aws_ecs_task_definition" "retool_jobs_runner" { family = "retool-jobs-runner" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null - requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null - network_mode = var.launch_type == "FARGATE" ? "awsvpc" : "bridge" + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] + network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? 
var.ecs_task_resource_map["jobs_runner"]["cpu"] : null memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["jobs_runner"]["memory"] : null container_definitions = jsonencode(concat( @@ -288,8 +251,8 @@ resource "aws_ecs_task_definition" "retool" { family = "retool" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null - requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null - network_mode = var.launch_type == "FARGATE" ? "awsvpc" : "bridge" + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] + network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["main"]["cpu"] : null memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["main"]["memory"] : null @@ -339,8 +302,8 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" { family = "retool-workflows-backend" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null - requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null - network_mode = var.launch_type == "FARGATE" ? "awsvpc" : "bridge" + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] + network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_backend"]["cpu"] : null memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_backend"]["memory"] : null @@ -390,8 +353,8 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" { family = "retool-workflows-worker" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null - requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null - network_mode = var.launch_type == "FARGATE" ? 
"awsvpc" : "bridge" + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] + network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_worker"]["cpu"] : null memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_worker"]["memory"] : null @@ -445,8 +408,8 @@ resource "aws_ecs_task_definition" "retool_code_executor" { family = "retool-code-executor" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null - requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null - network_mode = var.launch_type == "FARGATE" ? "awsvpc" : "bridge" + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] + network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["code_executor"]["cpu"] : null memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["code_executor"]["memory"] : null @@ -508,8 +471,8 @@ resource "aws_ecs_task_definition" "retool_telemetry" { family = "retool-telemetry" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null - requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null - network_mode = var.launch_type == "FARGATE" ? "awsvpc" : "bridge" + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] + network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["telemetry"]["cpu"] : null memory = var.launch_type == "FARGATE" ? 
var.ecs_task_resource_map["telemetry"]["memory"] : null From 0895fefe62c87a37fc37eb8716904e9bb710f8d5 Mon Sep 17 00:00:00 2001 From: Everett Smith Date: Thu, 24 Jul 2025 11:13:34 -0700 Subject: [PATCH 3/6] Fix EC2 launch type autoscaling defaults --- modules/aws_ecs/ecs.tf | 236 +++++++++++++++++++++++++++++++---- modules/aws_ecs/locals.tf | 11 +- modules/aws_ecs/main.tf | 112 +++++++++-------- modules/aws_ecs/variables.tf | 67 +++++++++- 4 files changed, 344 insertions(+), 82 deletions(-) diff --git a/modules/aws_ecs/ecs.tf b/modules/aws_ecs/ecs.tf index 5efb3ec..6e473c9 100644 --- a/modules/aws_ecs/ecs.tf +++ b/modules/aws_ecs/ecs.tf @@ -118,28 +118,6 @@ resource "aws_autoscaling_group" "this" { } } -# Attach an autoscaling policy to the spot cluster to target 70% MemoryReservation on the ECS cluster. -resource "aws_autoscaling_policy" "this" { - count = var.launch_type == "EC2" ? 1 : 0 - name = "${var.deployment_name}-ecs-scale-policy" - policy_type = "TargetTrackingScaling" - adjustment_type = "ChangeInCapacity" - autoscaling_group_name = aws_autoscaling_group.this[0].name - - target_tracking_configuration { - customized_metric_specification { - metric_dimension { - name = "ClusterName" - value = "${var.deployment_name}-ecs" - } - metric_name = "MemoryReservation" - namespace = "AWS/ECS" - statistic = "Average" - } - target_value = var.autoscaling_memory_reservation_target - } -} - resource "aws_ecs_capacity_provider" "this" { count = var.launch_type == "EC2" ? 1 : 0 name = "${var.deployment_name}-ecs-capacity-provider" @@ -147,4 +125,218 @@ resource "aws_ecs_capacity_provider" "this" { auto_scaling_group_provider { auto_scaling_group_arn = aws_autoscaling_group.this[0].arn + + managed_scaling { + status = "ENABLED" + target_capacity = 80 + minimum_scaling_step_size = 1 + maximum_scaling_step_size = 2 + instance_warmup_period = 300 + } } +} + +resource "aws_appautoscaling_target" "retool" { + count = var.launch_type == "EC2" ? 
1 : 0 + service_namespace = "ecs" + resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-main-backend-service" + scalable_dimension = "ecs:service:DesiredCount" + min_capacity = 1 + max_capacity = 3 + depends_on = [aws_ecs_service.retool] +} + +resource "aws_appautoscaling_target" "workflows_worker" { + count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + service_namespace = "ecs" + resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-workflows-worker-service" + scalable_dimension = "ecs:service:DesiredCount" + min_capacity = 1 + max_capacity = 3 + depends_on = [aws_ecs_service.workflows_worker] +} + +resource "aws_appautoscaling_target" "workflows_backend" { + count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + service_namespace = "ecs" + resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-workflows-backend-service" + scalable_dimension = "ecs:service:DesiredCount" + min_capacity = 1 + max_capacity = 3 + depends_on = [aws_ecs_service.workflows_backend] +} + +resource "aws_appautoscaling_target" "code_executor" { + count = (var.launch_type == "EC2" && var.code_executor_enabled) ? 1 : 0 + service_namespace = "ecs" + resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-code-executor-service" + scalable_dimension = "ecs:service:DesiredCount" + min_capacity = 1 + max_capacity = 3 + depends_on = [aws_ecs_service.code_executor] +} + +resource "aws_appautoscaling_policy" "retool_cpu" { + count = var.launch_type == "EC2" ? 
1 : 0 + name = "retool-cpu-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.retool[0].resource_id + scalable_dimension = aws_appautoscaling_target.retool[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 60.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageCPUUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 60 + } +} + +resource "aws_appautoscaling_policy" "workflows_worker_cpu" { + count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + name = "workflows-worker-cpu-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.workflows_worker[0].resource_id + scalable_dimension = aws_appautoscaling_target.workflows_worker[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 60.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageCPUUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 30 + } +} + +resource "aws_appautoscaling_policy" "workflows_backend_cpu" { + count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + name = "workflows_backend-cpu-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.workflows_backend[0].resource_id + scalable_dimension = aws_appautoscaling_target.workflows_backend[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 60.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageCPUUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 60 + } +} + +resource "aws_appautoscaling_policy" "code_executor_cpu" { + count = (var.launch_type == "EC2" && var.code_executor_enabled) ? 
1 : 0 + name = "code-executor-cpu-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.code_executor[0].resource_id + scalable_dimension = aws_appautoscaling_target.code_executor[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 60.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageCPUUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 60 + } +} + +resource "aws_appautoscaling_policy" "retool_memory" { + count = var.launch_type == "EC2" ? 1 : 0 + name = "retool-memory-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.retool[0].resource_id + scalable_dimension = aws_appautoscaling_target.retool[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 70.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageMemoryUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 60 + } +} + +resource "aws_appautoscaling_policy" "workflows_worker_memory" { + count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + name = "workflows-worker-memory-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.workflows_worker[0].resource_id + scalable_dimension = aws_appautoscaling_target.workflows_worker[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 70.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageMemoryUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 60 + } +} + +resource "aws_appautoscaling_policy" "workflows_backend_memory" { + count = (var.launch_type == "EC2" && var.workflows_enabled) ? 
1 : 0 + name = "workflows_backend-memory-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.workflows_backend[0].resource_id + scalable_dimension = aws_appautoscaling_target.workflows_backend[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 70.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageMemoryUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 60 + } } + +resource "aws_appautoscaling_policy" "code_executor_memory" { + count = (var.launch_type == "EC2" && var.code_executor_enabled) ? 1 : 0 + name = "code-executor-memory-policy" + service_namespace = "ecs" + resource_id = aws_appautoscaling_target.code_executor[0].resource_id + scalable_dimension = aws_appautoscaling_target.code_executor[0].scalable_dimension + policy_type = "TargetTrackingScaling" + + target_tracking_scaling_policy_configuration { + target_value = 70.0 + predefined_metric_specification { + predefined_metric_type = "ECSServiceAverageMemoryUtilization" + } + scale_in_cooldown = 60 + scale_out_cooldown = 60 + } +} + +# Attach an autoscaling policy to the spot cluster to target 70% MemoryReservation on the ECS cluster. +# resource "aws_autoscaling_policy" "this" { +# count = var.launch_type == "EC2" ? 
1 : 0 +# name = "${var.deployment_name}-ecs-scale-policy" +# policy_type = "TargetTrackingScaling" +# adjustment_type = "ChangeInCapacity" +# autoscaling_group_name = aws_autoscaling_group.this[0].name +# +# target_tracking_configuration { +# customized_metric_specification { +# metric_dimension { +# name = "ClusterName" +# value = "${var.deployment_name}-ecs" +# } +# metric_name = "MemoryReservation" +# namespace = "AWS/ECS" +# statistic = "Average" +# } +# target_value = var.autoscaling_memory_reservation_target +# } +# } diff --git a/modules/aws_ecs/locals.tf b/modules/aws_ecs/locals.tf index a58c2bb..98ff864 100644 --- a/modules/aws_ecs/locals.tf +++ b/modules/aws_ecs/locals.tf @@ -136,11 +136,12 @@ locals { common_containers = ( var.telemetry_enabled ? [ { - name = "retool-fluentbit" - essential = true - image = var.ecs_telemetry_fluentbit_image - cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["fluentbit"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["fluentbit"]["memory"] : null + name = "retool-fluentbit" + essential = true + image = var.ecs_telemetry_fluentbit_image + cpu = var.launch_type == "EC2" ? var.ec2_task_resource_map["fluentbit"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ec2_task_resource_map["fluentbit"]["memory"] : null + memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["fluentbit"]["memoryReservation"] : null firelensConfiguration = { type = "fluentbit" diff --git a/modules/aws_ecs/main.tf b/modules/aws_ecs/main.tf index d030128..25f8120 100644 --- a/modules/aws_ecs/main.tf +++ b/modules/aws_ecs/main.tf @@ -80,10 +80,13 @@ resource "aws_ecs_service" "jobs_runner" { name = "${var.deployment_name}-jobs-runner-service" cluster = aws_ecs_cluster.this.id + # desired_count is set to 1 since the Jobs Runner must be run as a singleton. 
desired_count = 1 task_definition = aws_ecs_task_definition.retool_jobs_runner.arn propagate_tags = var.task_propagate_tags enable_execute_command = var.enable_execute_command + deployment_minimum_healthy_percent = 0 + deployment_maximum_percent = 100 # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823 capacity_provider_strategy { @@ -103,10 +106,11 @@ resource "aws_ecs_service" "workflows_backend" { count = var.workflows_enabled ? 1 : 0 name = "${var.deployment_name}-workflows-backend-service" cluster = aws_ecs_cluster.this.id - desired_count = 1 task_definition = aws_ecs_task_definition.retool_workflows_backend[0].arn propagate_tags = var.task_propagate_tags enable_execute_command = var.enable_execute_command + deployment_maximum_percent = var.maximum_percent + deployment_minimum_healthy_percent = var.minimum_healthy_percent # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823 capacity_provider_strategy { @@ -130,8 +134,9 @@ resource "aws_ecs_service" "workflows_worker" { count = var.workflows_enabled ? 1 : 0 name = "${var.deployment_name}-workflows-worker-service" cluster = aws_ecs_cluster.this.id - desired_count = 1 task_definition = aws_ecs_task_definition.retool_workflows_worker[0].arn + deployment_maximum_percent = var.maximum_percent + deployment_minimum_healthy_percent = var.minimum_healthy_percent propagate_tags = var.task_propagate_tags enable_execute_command = var.enable_execute_command @@ -153,9 +158,10 @@ resource "aws_ecs_service" "code_executor" { count = var.code_executor_enabled ? 
1 : 0 name = "${var.deployment_name}-code-executor-service" cluster = aws_ecs_cluster.this.id - desired_count = 1 task_definition = aws_ecs_task_definition.retool_code_executor[0].arn enable_execute_command = var.enable_execute_command + deployment_maximum_percent = var.maximum_percent + deployment_minimum_healthy_percent = var.minimum_healthy_percent # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823 capacity_provider_strategy { @@ -208,18 +214,19 @@ resource "aws_ecs_task_definition" "retool_jobs_runner" { execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" - cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["jobs_runner"]["cpu"] : null - memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["jobs_runner"]["memory"] : null + cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["jobs_runner"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["jobs_runner"]["memory"] : null container_definitions = jsonencode(concat( local.common_containers, [ { - name = "retool-jobs-runner" - essential = true - image = var.ecs_retool_image - cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["jobs_runner"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["jobs_runner"]["memory"] : null - command = [ + name = "retool-jobs-runner" + essential = true + image = var.ecs_retool_image + cpu = var.launch_type == "EC2" ? var.ec2_task_resource_map["jobs_runner"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ec2_task_resource_map["jobs_runner"]["memory"] : null + memoryReservation = var.launch_type == "EC2" ? 
var.ec2_task_resource_map["jobs_runner"]["memoryReservation"] : null + command = [ "./docker_scripts/start_api.sh" ] @@ -253,19 +260,20 @@ resource "aws_ecs_task_definition" "retool" { execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" - cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["main"]["cpu"] : null - memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["main"]["memory"] : null + cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["main"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["main"]["memory"] : null container_definitions = jsonencode(concat( local.common_containers, [ { - name = "retool" - essential = true - image = var.ecs_retool_image - cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["main"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["main"]["memory"] : null - command = [ + name = "retool" + essential = true + image = var.ecs_retool_image + cpu = var.launch_type == "EC2" ? var.ec2_task_resource_map["main"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ec2_task_resource_map["main"]["memory"] : null + memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["main"]["memoryReservation"] : null + command = [ "./docker_scripts/start_api.sh" ] @@ -304,19 +312,20 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" { execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" - cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_backend"]["cpu"] : null - memory = var.launch_type == "FARGATE" ? 
var.ecs_task_resource_map["workflows_backend"]["memory"] : null + cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_backend"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_backend"]["memory"] : null container_definitions = jsonencode(concat( local.common_containers, [ { - name = "retool-workflows-backend" - essential = true - image = var.ecs_retool_image - cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["workflows_backend"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["workflows_backend"]["memory"] : null - command = [ + name = "retool-workflows-backend" + essential = true + image = var.ecs_retool_image + cpu = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_backend"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_backend"]["memory"] : null + memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_backend"]["memoryReservation"] : null + command = [ "./docker_scripts/start_api.sh" ] @@ -355,19 +364,20 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" { execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" - cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_worker"]["cpu"] : null - memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_worker"]["memory"] : null + cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_worker"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_worker"]["memory"] : null container_definitions = jsonencode(concat( local.common_containers, [ { - name = "retool-workflows-worker" - essential = true - image = var.ecs_retool_image - cpu = var.launch_type == "EC2" ? 
var.ecs_task_resource_map["workflows_worker"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["workflows_worker"]["memory"] : null - command = [ + name = "retool-workflows-worker" + essential = true + image = var.ecs_retool_image + cpu = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_worker"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_worker"]["memory"] : null + memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_worker"]["memoryReservation"] : null + command = [ "./docker_scripts/start_api.sh" ] @@ -410,19 +420,20 @@ resource "aws_ecs_task_definition" "retool_code_executor" { execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" - cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["code_executor"]["cpu"] : null - memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["code_executor"]["memory"] : null + cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["code_executor"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["code_executor"]["memory"] : null container_definitions = jsonencode(concat( local.common_containers, [ { - name = "retool-code-executor" - essential = true - image = local.ecs_code_executor_image - cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["code_executor"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["code_executor"]["memory"] : null - user = var.launch_type == "EC2" ? null : "1001:1001" + name = "retool-code-executor" + essential = true + image = local.ecs_code_executor_image + cpu = var.launch_type == "EC2" ? var.ec2_task_resource_map["code_executor"]["cpu"] : null + memory = var.launch_type == "EC2" ? 
var.ec2_task_resource_map["code_executor"]["memory"] : null + memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["code_executor"]["memoryReservation"] : null + user = var.launch_type == "EC2" ? null : "1001:1001" # required to use nsjail sandboxing, which is required for custom libraries for JS and Python # Learn more here: https://docs.retool.com/self-hosted/concepts/architecture#code-executor # If not using nsjail sandboxing, update this to be false and use user = "1001:1001" @@ -473,17 +484,18 @@ resource "aws_ecs_task_definition" "retool_telemetry" { execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" - cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["telemetry"]["cpu"] : null - memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["telemetry"]["memory"] : null + cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["telemetry"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["telemetry"]["memory"] : null container_definitions = jsonencode( [ { - name = "retool-telemetry" - essential = true - image = local.ecs_telemetry_image - cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["telemetry"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["telemetry"]["memory"] : null + name = "retool-telemetry" + essential = true + image = local.ecs_telemetry_image + cpu = var.launch_type == "EC2" ? var.ec2_task_resource_map["telemetry"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ec2_task_resource_map["telemetry"]["memory"] : null + memoryReservation = var.launch_type == "EC2" ? 
var.ec2_task_resource_map["telemetry"]["memoryReservation"] : null command = [ "retool-telemetry" ] diff --git a/modules/aws_ecs/variables.tf b/modules/aws_ecs/variables.tf index 81651d9..0e09be3 100644 --- a/modules/aws_ecs/variables.tf +++ b/modules/aws_ecs/variables.tf @@ -45,7 +45,7 @@ variable "max_instance_count" { variable "min_instance_count" { type = number - description = "Min/desired number of EC2 instances. Defaults to 4." + description = "Min/desired number of EC2 instances. Defaults to 3." default = 3 } @@ -97,7 +97,58 @@ variable "ecs_telemetry_fluentbit_image" { default = "tryretool/retool-aws-for-fluent-bit:3.120.0-edge" } -variable "ecs_task_resource_map" { +# ECS treats CPU and Memory differently between EC2 and Fargate launch types. +# Retool provides separate sane defaults for both launch types and the template will use the resource map for the configured launch type. +# With Fargate, ECS treats CPU and Memory as exact requests, but with EC2, ECS treats CPU as a soft limit, +# memory as a hard limit and supports the additional memoryReservation as a soft limit. + +variable "ec2_task_resource_map" { + type = map(object({ + cpu = number + memory = number + memoryReservation = number + })) + default = { + main = { + cpu = 2048 + memory = 4096 + memoryReservation = 3072 + }, + jobs_runner = { + cpu = 1024 + memory = 4096 + memoryReservation = 2048 + }, + workflows_backend = { + cpu = 2048 + memory = 4096 + memoryReservation = 3072 + } + workflows_worker = { + cpu = 1024 + memory = 4096 + memoryReservation = 2048 + } + code_executor = { + cpu = 1024 + memory = 4096 + memoryReservation = 2048 + } + telemetry = { + cpu = 1024 + memory = 4096 + memoryReservation = 2048 + } + fluentbit = { + cpu = 512 + memory = 2048 + memoryReservation = 1024 + } + } + description = "Amount of CPU and Memory provisioned for each task with EC2 launch type set." 
+} + +variable "fargate_task_resource_map" { type = map(object({ cpu = number memory = number @@ -108,8 +159,8 @@ variable "ecs_task_resource_map" { memory = 4096 }, jobs_runner = { - cpu = 1024 - memory = 2048 + cpu = 2048 + memory = 4096 }, workflows_backend = { cpu = 2048 @@ -132,7 +183,7 @@ variable "ecs_task_resource_map" { memory = 1024 } } - description = "Amount of CPU and Memory provisioned for each task." + description = "Amount of CPU and Memory provisioned for each task with Fargate launch type." } variable "temporal_ecs_task_resource_map" { @@ -451,6 +502,12 @@ variable "autoscaling_memory_reservation_target" { description = "Memory reservation target for the Autoscaling Group. Defaults to 70.0." } +variable "autoscaling_cpu_reservation_target" { + type = number + default = 60.0 + description = "CPU reservation target for the Autoscaling Group. Defaults to 60.0." +} + variable "additional_env_vars" { type = list(map(string)) default = [] From a67574cdc8a059eb0f97b0fb5cc3c0ce5d485525 Mon Sep 17 00:00:00 2001 From: Everett Smith Date: Thu, 24 Jul 2025 11:30:16 -0700 Subject: [PATCH 4/6] Only set env vars for local temporal when using local temporal --- modules/aws_ecs/locals.tf | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/modules/aws_ecs/locals.tf b/modules/aws_ecs/locals.tf index 98ff864..37cf598 100644 --- a/modules/aws_ecs/locals.tf +++ b/modules/aws_ecs/locals.tf @@ -26,13 +26,6 @@ locals { environment_variables = concat( var.additional_env_vars, # add additional environment variables local.base_environment_variables, - local.temporal_mtls_config, - var.code_executor_enabled ? [ - { - name = "CODE_EXECUTOR_INGRESS_DOMAIN" - value = format("http://code-executor.%s:3004", local.service_discovery_namespace) - } - ] : [], var.telemetry_enabled ? 
[ { name = "RTEL_ENABLED" @@ -88,11 +81,22 @@ locals { "name" : "LICENSE_KEY", "value" : var.retool_license_key }, - # Workflows-specific + # WORKFLOW_BACKEND_HOST and CODE_EXECUTOR_INGRESS_DOMAIN are workflows-specific services { "name" : "WORKFLOW_BACKEND_HOST", "value" : format("http://workflow-backend.%s:3000", local.service_discovery_namespace) - }, + } + ], + var.code_executor_enabled ? [ + { + name = "CODE_EXECUTOR_INGRESS_DOMAIN" + value = format("http://code-executor.%s:3004", local.service_discovery_namespace) + } + ] : [], + # The section below is only needed if deploying Temporal locally from this template. + # Retool strongly recommends using the Retool Managed Temporal option instead. + local.temporal_mtls_config, + var.use_existing_temporal_cluster ? [] : [ { "name" : "WORKFLOW_TEMPORAL_CLUSTER_NAMESPACE", "value" : var.temporal_cluster_config.namespace From 8562d06b4863c18c0fdc3553c4ba624a0db6af3d Mon Sep 17 00:00:00 2001 From: Everett Smith Date: Thu, 24 Jul 2025 12:26:59 -0700 Subject: [PATCH 5/6] Update to support secrets --- modules/aws_ecs/locals.tf | 30 ++++++++++++++++++------------ modules/aws_ecs/main.tf | 22 ++++++++++++++++------ modules/aws_ecs/roles.tf | 30 +++++++++++++++++++++++++++--- modules/aws_ecs/variables.tf | 6 ++++++ 4 files changed, 67 insertions(+), 21 deletions(-) diff --git a/modules/aws_ecs/locals.tf b/modules/aws_ecs/locals.tf index 37cf598..dd65328 100644 --- a/modules/aws_ecs/locals.tf +++ b/modules/aws_ecs/locals.tf @@ -65,18 +65,6 @@ locals { "name" = "POSTGRES_USER", "value" = var.rds_username }, - { - "name" = "POSTGRES_PASSWORD", - "value" = random_string.rds_password.result - }, - { - "name" : "JWT_SECRET", - "value" : random_string.jwt_secret.result - }, - { - "name" : "ENCRYPTION_KEY", - "value" : random_string.encryption_key.result - }, { "name" : "LICENSE_KEY", "value" : var.retool_license_key @@ -116,6 +104,24 @@ locals { ] ) + secrets = concat( + var.additional_secrets, + [ + { + name = 
"POSTGRES_PASSWORD", + valueFrom = aws_secretsmanager_secret.rds_password.arn + }, + { + name = "JWT_SECRET", + valueFrom = aws_secretsmanager_secret.jwt_secret.arn + }, + { + name = "ENCRYPTION_KEY", + valueFrom = aws_secretsmanager_secret.encryption_key.arn + } + ] + ) + task_log_configuration = ( var.telemetry_enabled ? { # Send logs to CloudWatch in addition to telemetry service: diff --git a/modules/aws_ecs/main.tf b/modules/aws_ecs/main.tf index 25f8120..b42bfb5 100644 --- a/modules/aws_ecs/main.tf +++ b/modules/aws_ecs/main.tf @@ -211,7 +211,7 @@ resource "aws_ecs_service" "telemetry" { resource "aws_ecs_task_definition" "retool_jobs_runner" { family = "retool-jobs-runner" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null + execution_role_arn = aws_iam_role.execution_role[0].arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["jobs_runner"]["cpu"] : null @@ -249,6 +249,8 @@ resource "aws_ecs_task_definition" "retool_jobs_runner" { } ] ) + + secrets = local.secrets } ] )) @@ -257,7 +259,7 @@ resource "aws_ecs_task_definition" "retool_jobs_runner" { resource "aws_ecs_task_definition" "retool" { family = "retool" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null + execution_role_arn = aws_iam_role.execution_role[0].arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["main"]["cpu"] : null @@ -300,6 +302,8 @@ resource "aws_ecs_task_definition" "retool" { } ] ) + + secrets = local.secrets } ] )) @@ -309,7 +313,7 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" { count = var.workflows_enabled ? 
1 : 0 family = "retool-workflows-backend" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null + execution_role_arn = aws_iam_role.execution_role[0].arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_backend"]["cpu"] : null @@ -352,6 +356,8 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" { } ] ) + + secrets = local.secrets } ] )) @@ -361,7 +367,7 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" { count = var.workflows_enabled ? 1 : 0 family = "retool-workflows-worker" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null + execution_role_arn = aws_iam_role.execution_role[0].arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["code_executor"]["cpu"] : null @@ -408,6 +414,8 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" { } ] ) + + secrets = local.secrets } ] )) @@ -417,7 +425,7 @@ resource "aws_ecs_task_definition" "retool_code_executor" { count = var.code_executor_enabled ? 1 : 0 family = "retool-code-executor" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null + execution_role_arn = aws_iam_role.execution_role[0].arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? 
var.fargate_task_resource_map["telemetry"]["cpu"] : null @@ -472,6 +480,8 @@ resource "aws_ecs_task_definition" "retool_code_executor" { } ] : [] ) + + secrets = local.secrets } ] )) @@ -481,7 +491,7 @@ resource "aws_ecs_task_definition" "retool_telemetry" { count = var.telemetry_enabled ? 1 : 0 family = "retool-telemetry" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null + execution_role_arn = aws_iam_role.execution_role[0].arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["telemetry"]["cpu"] : null diff --git a/modules/aws_ecs/roles.tf b/modules/aws_ecs/roles.tf index b242bd2..2662df8 100644 --- a/modules/aws_ecs/roles.tf +++ b/modules/aws_ecs/roles.tf @@ -69,7 +69,6 @@ resource "aws_iam_role" "service_role" { } } -# Execution Role for Fargate data "aws_iam_policy_document" "execution_role_assume_policy" { statement { actions = ["sts:AssumeRole"] @@ -82,17 +81,42 @@ data "aws_iam_policy_document" "execution_role_assume_policy" { } resource "aws_iam_role" "execution_role" { - count = var.launch_type == "FARGATE" ? 1 : 0 name = "${var.deployment_name}-execution-role" assume_role_policy = data.aws_iam_policy_document.execution_role_assume_policy.json } resource "aws_iam_role_policy_attachment" "execution_role" { - count = var.launch_type == "FARGATE" ? 
1 : 0 role = aws_iam_role.execution_role[0].name policy_arn = "arn:${var.iam_partition}:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" } +data "aws_iam_policy_document" "execution_role_read_secrets" { + statement { + effect = "Allow" + + actions = [ + "secretsmanager:GetSecretValue", + ] + + resources = [ + aws_secretsmanager_secret.rds_password.arn, + aws_secretsmanager_secret.encryption_key.arn, + aws_secretsmanager_secret.jwt_secret.arn + ] + } +} + +resource aws_iam_policy "execution_role_read_secrets" { + name = "ExecutionRoleReadSecrets" + description = "Allows ECS or EC2 instance execution to read secrets block values from AWS Secret Manager" + policy = data.aws_iam_policy_document.execution_role_read_secrets.json +} + +resource "aws_iam_role_policy_attachment" "execution_role_read_secrets" { + role = aws_iam_role.execution_role[0].name + policy_arn = aws_iam_policy.execution_role_read_secrets.arn +} + # IAM Role for EC2 instances resource "aws_iam_instance_profile" "ec2" { count = var.launch_type == "EC2" ? 1 : 0 diff --git a/modules/aws_ecs/variables.tf b/modules/aws_ecs/variables.tf index 0e09be3..c48a7d9 100644 --- a/modules/aws_ecs/variables.tf +++ b/modules/aws_ecs/variables.tf @@ -514,6 +514,12 @@ variable "additional_env_vars" { description = "Additional environment variables (e.g. BASE_DOMAIN)" } +variable "additional_secrets" { + type = list(map(string)) + default = [] + description = "Optional additional environment variables set from pre-existing AWS Secrets Manager Secrets." 
+} + variable "additional_temporal_env_vars" { type = list(map(string)) default = [] From a3ca3b1e0764a9acc54d736e2583e4ecc6a9104c Mon Sep 17 00:00:00 2001 From: Everett Smith Date: Thu, 24 Jul 2025 20:27:48 -0700 Subject: [PATCH 6/6] Minor corrections --- modules/aws_ecs/ecs.tf | 47 +++++++++++++------------- modules/aws_ecs/main.tf | 64 +++++++++++++++++++----------------- modules/aws_ecs/roles.tf | 4 +-- modules/aws_ecs/variables.tf | 9 +++-- 4 files changed, 67 insertions(+), 57 deletions(-) diff --git a/modules/aws_ecs/ecs.tf b/modules/aws_ecs/ecs.tf index 6e473c9..80da59f 100644 --- a/modules/aws_ecs/ecs.tf +++ b/modules/aws_ecs/ecs.tf @@ -55,7 +55,7 @@ resource "aws_launch_template" "this" { # This user data represents a collection of “scripts” that will be executed the first time the machine starts. # This specific example makes sure the EC2 instance is automatically attached to the ECS cluster that we create earlier # and marks the instance as purchased through the Spot pricing - user_data = base64encod(<<-EOF + user_data = base64encode(<<-EOF #!/bin/bash echo ECS_CLUSTER=${var.deployment_name}-ecs >> /etc/ecs/ecs.config EOF @@ -67,7 +67,9 @@ resource "aws_launch_template" "this" { key_name = var.ssh_key_name # Allow the EC2 instances to access AWS resources on your behalf, using this instance profile and the permissions defined there - iam_instance_profile = aws_iam_instance_profile.ec2[0].arn + iam_instance_profile { + name = aws_iam_instance_profile.ec2[0].name + } lifecycle { create_before_destroy = true @@ -124,21 +126,20 @@ resource "aws_ecs_capacity_provider" "this" { auto_scaling_group_provider { auto_scaling_group_arn = aws_autoscaling_group.this[0].arn - } - - managed_scaling { - status = "ENABLED" - target_capacity = 80 - minimum_scaling_step_size = 1 - maximum_scaling_step_size = 2 - instance_warmup_period = 300 + managed_scaling { + status = "ENABLED" + target_capacity = 80 + minimum_scaling_step_size = 1 + maximum_scaling_step_size = 2 + 
instance_warmup_period = 300 + } } } resource "aws_appautoscaling_target" "retool" { - count = var.launch_type == "EC2" ? 1 : 0 + count = 1 service_namespace = "ecs" - resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-main-backend-service" + resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-main-service" scalable_dimension = "ecs:service:DesiredCount" min_capacity = 1 max_capacity = 3 @@ -146,7 +147,7 @@ resource "aws_appautoscaling_target" "retool" { } resource "aws_appautoscaling_target" "workflows_worker" { - count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + count = var.workflows_enabled ? 1 : 0 service_namespace = "ecs" resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-workflows-worker-service" scalable_dimension = "ecs:service:DesiredCount" @@ -156,7 +157,7 @@ resource "aws_appautoscaling_target" "workflows_worker" { } resource "aws_appautoscaling_target" "workflows_backend" { - count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + count = var.workflows_enabled ? 1 : 0 service_namespace = "ecs" resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-workflows-backend-service" scalable_dimension = "ecs:service:DesiredCount" @@ -166,7 +167,7 @@ resource "aws_appautoscaling_target" "workflows_backend" { } resource "aws_appautoscaling_target" "code_executor" { - count = (var.launch_type == "EC2" && var.code_executor_enabled) ? 1 : 0 + count = var.code_executor_enabled ? 1 : 0 service_namespace = "ecs" resource_id = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-code-executor-service" scalable_dimension = "ecs:service:DesiredCount" @@ -176,7 +177,7 @@ resource "aws_appautoscaling_target" "code_executor" { } resource "aws_appautoscaling_policy" "retool_cpu" { - count = var.launch_type == "EC2" ? 
1 : 0 + count = 1 name = "retool-cpu-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.retool[0].resource_id @@ -194,7 +195,7 @@ resource "aws_appautoscaling_policy" "retool_cpu" { } resource "aws_appautoscaling_policy" "workflows_worker_cpu" { - count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + count = var.workflows_enabled ? 1 : 0 name = "workflows-worker-cpu-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.workflows_worker[0].resource_id @@ -212,7 +213,7 @@ resource "aws_appautoscaling_policy" "workflows_worker_cpu" { } resource "aws_appautoscaling_policy" "workflows_backend_cpu" { - count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + count = var.workflows_enabled ? 1 : 0 name = "workflows_backend-cpu-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.workflows_backend[0].resource_id @@ -230,7 +231,7 @@ resource "aws_appautoscaling_policy" "workflows_backend_cpu" { } resource "aws_appautoscaling_policy" "code_executor_cpu" { - count = (var.launch_type == "EC2" && var.code_executor_enabled) ? 1 : 0 + count = var.code_executor_enabled ? 1 : 0 name = "code-executor-cpu-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.code_executor[0].resource_id @@ -248,7 +249,7 @@ resource "aws_appautoscaling_policy" "code_executor_cpu" { } resource "aws_appautoscaling_policy" "retool_memory" { - count = var.launch_type == "EC2" ? 1 : 0 + count = 1 name = "retool-memory-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.retool[0].resource_id @@ -266,7 +267,7 @@ resource "aws_appautoscaling_policy" "retool_memory" { } resource "aws_appautoscaling_policy" "workflows_worker_memory" { - count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + count = var.workflows_enabled ? 
1 : 0 name = "workflows-worker-memory-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.workflows_worker[0].resource_id @@ -284,7 +285,7 @@ resource "aws_appautoscaling_policy" "workflows_worker_memory" { } resource "aws_appautoscaling_policy" "workflows_backend_memory" { - count = (var.launch_type == "EC2" && var.workflows_enabled) ? 1 : 0 + count = var.workflows_enabled ? 1 : 0 name = "workflows_backend-memory-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.workflows_backend[0].resource_id @@ -302,7 +303,7 @@ resource "aws_appautoscaling_policy" "workflows_backend_memory" { } resource "aws_appautoscaling_policy" "code_executor_memory" { - count = (var.launch_type == "EC2" && var.code_executor_enabled) ? 1 : 0 + count = var.code_executor_enabled ? 1 : 0 name = "code-executor-memory-policy" service_namespace = "ecs" resource_id = aws_appautoscaling_target.code_executor[0].resource_id diff --git a/modules/aws_ecs/main.tf b/modules/aws_ecs/main.tf index b42bfb5..352684e 100644 --- a/modules/aws_ecs/main.tf +++ b/modules/aws_ecs/main.tf @@ -7,6 +7,11 @@ terraform { } } +provider "aws" { + profile = var.profile + region = var.aws_region +} + data "aws_vpc" "selected" { id = var.vpc_id } @@ -51,7 +56,6 @@ resource "aws_ecs_service" "retool" { name = "${var.deployment_name}-main-service" cluster = aws_ecs_cluster.this.id task_definition = aws_ecs_task_definition.retool.arn - desired_count = var.min_instance_count - 1 deployment_maximum_percent = var.maximum_percent deployment_minimum_healthy_percent = var.minimum_healthy_percent propagate_tags = var.task_propagate_tags @@ -78,13 +82,13 @@ resource "aws_ecs_service" "retool" { } resource "aws_ecs_service" "jobs_runner" { - name = "${var.deployment_name}-jobs-runner-service" - cluster = aws_ecs_cluster.this.id + name = "${var.deployment_name}-jobs-runner-service" + cluster = aws_ecs_cluster.this.id # desired_count is set to 1 since the Jobs Runner must be run as a 
singleton. - desired_count = 1 - task_definition = aws_ecs_task_definition.retool_jobs_runner.arn - propagate_tags = var.task_propagate_tags - enable_execute_command = var.enable_execute_command + desired_count = 1 + task_definition = aws_ecs_task_definition.retool_jobs_runner.arn + propagate_tags = var.task_propagate_tags + enable_execute_command = var.enable_execute_command deployment_minimum_healthy_percent = 0 deployment_maximum_percent = 100 @@ -103,12 +107,12 @@ resource "aws_ecs_service" "jobs_runner" { } resource "aws_ecs_service" "workflows_backend" { - count = var.workflows_enabled ? 1 : 0 - name = "${var.deployment_name}-workflows-backend-service" - cluster = aws_ecs_cluster.this.id - task_definition = aws_ecs_task_definition.retool_workflows_backend[0].arn - propagate_tags = var.task_propagate_tags - enable_execute_command = var.enable_execute_command + count = var.workflows_enabled ? 1 : 0 + name = "${var.deployment_name}-workflows-backend-service" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.retool_workflows_backend[0].arn + propagate_tags = var.task_propagate_tags + enable_execute_command = var.enable_execute_command deployment_maximum_percent = var.maximum_percent deployment_minimum_healthy_percent = var.minimum_healthy_percent @@ -131,14 +135,14 @@ resource "aws_ecs_service" "workflows_backend" { } resource "aws_ecs_service" "workflows_worker" { - count = var.workflows_enabled ? 1 : 0 - name = "${var.deployment_name}-workflows-worker-service" - cluster = aws_ecs_cluster.this.id - task_definition = aws_ecs_task_definition.retool_workflows_worker[0].arn + count = var.workflows_enabled ? 
1 : 0 + name = "${var.deployment_name}-workflows-worker-service" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.retool_workflows_worker[0].arn deployment_maximum_percent = var.maximum_percent deployment_minimum_healthy_percent = var.minimum_healthy_percent - propagate_tags = var.task_propagate_tags - enable_execute_command = var.enable_execute_command + propagate_tags = var.task_propagate_tags + enable_execute_command = var.enable_execute_command # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823 capacity_provider_strategy { @@ -155,11 +159,11 @@ resource "aws_ecs_service" "workflows_worker" { } resource "aws_ecs_service" "code_executor" { - count = var.code_executor_enabled ? 1 : 0 - name = "${var.deployment_name}-code-executor-service" - cluster = aws_ecs_cluster.this.id - task_definition = aws_ecs_task_definition.retool_code_executor[0].arn - enable_execute_command = var.enable_execute_command + count = var.code_executor_enabled ? 1 : 0 + name = "${var.deployment_name}-code-executor-service" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.retool_code_executor[0].arn + enable_execute_command = var.enable_execute_command deployment_maximum_percent = var.maximum_percent deployment_minimum_healthy_percent = var.minimum_healthy_percent @@ -211,7 +215,7 @@ resource "aws_ecs_service" "telemetry" { resource "aws_ecs_task_definition" "retool_jobs_runner" { family = "retool-jobs-runner" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = aws_iam_role.execution_role[0].arn + execution_role_arn = aws_iam_role.execution_role.arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? 
var.fargate_task_resource_map["jobs_runner"]["cpu"] : null @@ -259,7 +263,7 @@ resource "aws_ecs_task_definition" "retool_jobs_runner" { resource "aws_ecs_task_definition" "retool" { family = "retool" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = aws_iam_role.execution_role[0].arn + execution_role_arn = aws_iam_role.execution_role.arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["main"]["cpu"] : null @@ -313,7 +317,7 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" { count = var.workflows_enabled ? 1 : 0 family = "retool-workflows-backend" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = aws_iam_role.execution_role[0].arn + execution_role_arn = aws_iam_role.execution_role.arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_backend"]["cpu"] : null @@ -367,7 +371,7 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" { count = var.workflows_enabled ? 1 : 0 family = "retool-workflows-worker" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = aws_iam_role.execution_role[0].arn + execution_role_arn = aws_iam_role.execution_role.arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["code_executor"]["cpu"] : null @@ -425,7 +429,7 @@ resource "aws_ecs_task_definition" "retool_code_executor" { count = var.code_executor_enabled ? 1 : 0 family = "retool-code-executor" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = aws_iam_role.execution_role[0].arn + execution_role_arn = aws_iam_role.execution_role.arn requires_compatibilities = var.launch_type == "FARGATE" ? 
["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["telemetry"]["cpu"] : null @@ -491,7 +495,7 @@ resource "aws_ecs_task_definition" "retool_telemetry" { count = var.telemetry_enabled ? 1 : 0 family = "retool-telemetry" task_role_arn = aws_iam_role.task_role.arn - execution_role_arn = aws_iam_role.execution_role[0].arn + execution_role_arn = aws_iam_role.execution_role.arn requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"] network_mode = "awsvpc" cpu = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["telemetry"]["cpu"] : null diff --git a/modules/aws_ecs/roles.tf b/modules/aws_ecs/roles.tf index 2662df8..f28387b 100644 --- a/modules/aws_ecs/roles.tf +++ b/modules/aws_ecs/roles.tf @@ -86,7 +86,7 @@ resource "aws_iam_role" "execution_role" { } resource "aws_iam_role_policy_attachment" "execution_role" { - role = aws_iam_role.execution_role[0].name + role = aws_iam_role.execution_role.name policy_arn = "arn:${var.iam_partition}:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" } @@ -113,7 +113,7 @@ resource aws_iam_policy "execution_role_read_secrets" { } resource "aws_iam_role_policy_attachment" "execution_role_read_secrets" { - role = aws_iam_role.execution_role[0].name + role = aws_iam_role.execution_role.name policy_arn = aws_iam_policy.execution_role_read_secrets.arn } diff --git a/modules/aws_ecs/variables.tf b/modules/aws_ecs/variables.tf index c48a7d9..6ac0be3 100644 --- a/modules/aws_ecs/variables.tf +++ b/modules/aws_ecs/variables.tf @@ -4,6 +4,11 @@ variable "aws_region" { description = "AWS region. Defaults to `us-east-1`" } +variable "profile" { + type = string + description = "Optional AWS CLI Profile." 
+} + variable "node_env" { type = string default = "production" @@ -112,7 +117,7 @@ variable "ec2_task_resource_map" { main = { cpu = 2048 memory = 4096 - memoryReservation = 3072 + memoryReservation = 4096 }, jobs_runner = { cpu = 1024 @@ -122,7 +127,7 @@ variable "ec2_task_resource_map" { workflows_backend = { cpu = 2048 memory = 4096 - memoryReservation = 3072 + memoryReservation = 4096 } workflows_worker = { cpu = 1024