diff --git a/modules/aws_ecs/locals.tf b/modules/aws_ecs/locals.tf index a58c2bb..66c98dc 100644 --- a/modules/aws_ecs/locals.tf +++ b/modules/aws_ecs/locals.tf @@ -11,6 +11,10 @@ locals { { name = "DEPLOYMENT_TEMPLATE_TYPE" value = var.launch_type == "FARGATE" ? "aws-ecs-fargate-terraform" : "aws-ecs-ec2-terraform" + }, + { + name = "BASE_DOMAIN" + value = var.base_domain } ] @@ -27,7 +31,7 @@ locals { var.additional_env_vars, # add additional environment variables local.base_environment_variables, local.temporal_mtls_config, - var.code_executor_enabled ? [ + (var.code_executor_enabled || var.workflows_enabled || var.agents_enabled) ? [ { name = "CODE_EXECUTOR_INGRESS_DOMAIN" value = format("http://code-executor.%s:3004", local.service_discovery_namespace) @@ -123,7 +127,7 @@ locals { auto_create_group = "true" log_stream_prefix = "SERVICE_RETOOL/" } - } : { + } : { logDriver = "awslogs" options = { awslogs-group = aws_cloudwatch_log_group.this.id @@ -143,7 +147,7 @@ locals { memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["fluentbit"]["memory"] : null firelensConfiguration = { - type = "fluentbit" + type = "fluentbit" options = { config-file-type = "file" config-file-value = "/extra.conf" @@ -152,7 +156,7 @@ locals { logConfiguration = { logDriver = "awslogs" - options = { + options = { awslogs-group = aws_cloudwatch_log_group.this.id awslogs-region = var.aws_region awslogs-stream-prefix = "SERVICE_RETOOL" diff --git a/modules/aws_ecs/main.tf b/modules/aws_ecs/main.tf index 800e0f6..9e9060c 100644 --- a/modules/aws_ecs/main.tf +++ b/modules/aws_ecs/main.tf @@ -113,7 +113,7 @@ resource "aws_ecs_service" "jobs_runner" { } resource "aws_ecs_service" "workflows_backend" { - count = var.workflows_enabled ? 1 : 0 + count = (var.workflows_enabled || var.agents_enabled) ? 
1 : 0 name = "${var.deployment_name}-workflows-backend-service" cluster = aws_ecs_cluster.this.id desired_count = 1 @@ -146,7 +146,7 @@ resource "aws_ecs_service" "workflows_backend" { } resource "aws_ecs_service" "workflows_worker" { - count = var.workflows_enabled ? 1 : 0 + count = (var.workflows_enabled || var.agents_enabled) ? 1 : 0 name = "${var.deployment_name}-workflows-worker-service" cluster = aws_ecs_cluster.this.id desired_count = 1 @@ -175,7 +175,7 @@ resource "aws_ecs_service" "workflows_worker" { } resource "aws_ecs_service" "code_executor" { - count = var.code_executor_enabled ? 1 : 0 + count = (var.code_executor_enabled || var.workflows_enabled || var.agents_enabled) ? 1 : 0 name = "${var.deployment_name}-code-executor-service" cluster = aws_ecs_cluster.this.id desired_count = 1 @@ -239,6 +239,64 @@ resource "aws_ecs_service" "telemetry" { } } +resource "aws_ecs_service" "agent_worker" { + count = var.agents_enabled ? 1 : 0 + name = "${var.deployment_name}-agent-worker-service" + cluster = aws_ecs_cluster.this.id + desired_count = 1 + task_definition = aws_ecs_task_definition.retool_agent_worker[0].arn + propagate_tags = var.task_propagate_tags + enable_execute_command = var.enable_execute_command + + # Need to explicitly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823 + capacity_provider_strategy { + base = 1 + weight = 100 + capacity_provider = var.launch_type == "FARGATE" ? "FARGATE" : aws_ecs_capacity_provider.this[0].name + } + + dynamic "network_configuration" { + for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([]) + + content { + subnets = var.private_subnet_ids + security_groups = [ + aws_security_group.containers.id + ] + assign_public_ip = true + } + } +} + +resource "aws_ecs_service" "agent_eval_worker" { + count = var.agents_enabled ?
1 : 0 + name = "${var.deployment_name}-agent-eval-worker-service" + cluster = aws_ecs_cluster.this.id + desired_count = 1 + task_definition = aws_ecs_task_definition.retool_agent_eval_worker[0].arn + propagate_tags = var.task_propagate_tags + enable_execute_command = var.enable_execute_command + + # Need to explicitly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823 + capacity_provider_strategy { + base = 1 + weight = 100 + capacity_provider = var.launch_type == "FARGATE" ? "FARGATE" : aws_ecs_capacity_provider.this[0].name + } + + dynamic "network_configuration" { + for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([]) + + content { + subnets = var.private_subnet_ids + security_groups = [ + aws_security_group.containers.id + ] + assign_public_ip = true + } + } +} + resource "aws_ecs_task_definition" "retool_jobs_runner" { family = "retool-jobs-runner" task_role_arn = aws_iam_role.task_role.arn @@ -335,7 +393,7 @@ resource "aws_ecs_task_definition" "retool" { } resource "aws_ecs_task_definition" "retool_workflows_backend" { - count = var.workflows_enabled ? 1 : 0 + count = (var.workflows_enabled || var.agents_enabled) ? 1 : 0 family = "retool-workflows-backend" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null @@ -386,7 +444,7 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" { } resource "aws_ecs_task_definition" "retool_workflows_worker" { - count = var.workflows_enabled ? 1 : 0 + count = (var.workflows_enabled || var.agents_enabled) ? 1 : 0 family = "retool-workflows-worker" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ?
aws_iam_role.execution_role[0].arn : null @@ -440,8 +498,126 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" { )) } +resource "aws_ecs_task_definition" "retool_agent_worker" { + count = var.agents_enabled ? 1 : 0 + family = "retool-agent-worker" + task_role_arn = aws_iam_role.task_role.arn + execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null + network_mode = var.launch_type == "FARGATE" ? "awsvpc" : "bridge" + cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["agent_worker"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["agent_worker"]["memory"] : null + + container_definitions = jsonencode(concat( + local.common_containers, + [ + { + name = "retool-agent-worker" + essential = true + image = var.ecs_retool_image + cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["agent_worker"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["agent_worker"]["memory"] : null + command = [ + "./docker_scripts/start_api.sh" + ] + + logConfiguration = local.task_log_configuration + + portMappings = [ + { + containerPort = 3005 + hostPort = 3005 + protocol = "tcp" + } + ] + + environment = concat( + local.environment_variables, + [ + { + name = "SERVICE_TYPE" + value = "WORKFLOW_TEMPORAL_WORKER" + }, + { + name = "WORKER_TEMPORAL_TASKQUEUE" + value = "agent" + }, + { + name = "NODE_OPTIONS" + value = "--max_old_space_size=1024" + }, + { + "name" = "COOKIE_INSECURE", + "value" = tostring(var.cookie_insecure) + } + ] + ) + } + ] + )) +} + +resource "aws_ecs_task_definition" "retool_agent_eval_worker" { + count = var.agents_enabled ? 1 : 0 + family = "retool-agent-eval-worker" + task_role_arn = aws_iam_role.task_role.arn + execution_role_arn = var.launch_type == "FARGATE" ? 
aws_iam_role.execution_role[0].arn : null + requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null + network_mode = var.launch_type == "FARGATE" ? "awsvpc" : "bridge" + cpu = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["agent_eval_worker"]["cpu"] : null + memory = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["agent_eval_worker"]["memory"] : null + + container_definitions = jsonencode(concat( + local.common_containers, + [ + { + name = "retool-agent-eval-worker" + essential = true + image = var.ecs_retool_image + cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["agent_eval_worker"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["agent_eval_worker"]["memory"] : null + command = [ + "./docker_scripts/start_api.sh" + ] + + logConfiguration = local.task_log_configuration + + portMappings = [ + { + containerPort = 3005 + hostPort = 3005 + protocol = "tcp" + } + ] + + environment = concat( + local.environment_variables, + [ + { + name = "SERVICE_TYPE" + value = "AGENT_EVAL_TEMPORAL_WORKER" + }, + { + name = "WORKER_TEMPORAL_TASKQUEUE" + value = "agent-eval" + }, + { + name = "NODE_OPTIONS" + value = "--max_old_space_size=1024" + }, + { + "name" = "COOKIE_INSECURE", + "value" = tostring(var.cookie_insecure) + } + ] + ) + } + ] + )) +} + resource "aws_ecs_task_definition" "retool_code_executor" { - count = var.code_executor_enabled ? 1 : 0 + count = (var.code_executor_enabled || var.workflows_enabled || var.agents_enabled) ? 1 : 0 family = "retool-code-executor" task_role_arn = aws_iam_role.task_role.arn execution_role_arn = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null @@ -454,12 +630,12 @@ resource "aws_ecs_task_definition" "retool_code_executor" { local.common_containers, [ { - name = "retool-code-executor" - essential = true - image = local.ecs_code_executor_image - cpu = var.launch_type == "EC2" ? 
var.ecs_task_resource_map["code_executor"]["cpu"] : null - memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["code_executor"]["memory"] : null - user = var.launch_type == "EC2" ? null : "1001:1001" + name = "retool-code-executor" + essential = true + image = local.ecs_code_executor_image + cpu = var.launch_type == "EC2" ? var.ecs_task_resource_map["code_executor"]["cpu"] : null + memory = var.launch_type == "EC2" ? var.ecs_task_resource_map["code_executor"]["memory"] : null + user = var.launch_type == "EC2" ? null : "1001:1001" # required to use nsjail sandboxing, which is required for custom libraries for JS and Python # Learn more here: https://docs.retool.com/self-hosted/concepts/architecture#code-executor # If not using nsjail sandboxing, update this to be false and use user = "1001:1001" @@ -486,7 +662,7 @@ resource "aws_ecs_task_definition" "retool_code_executor" { local.base_environment_variables, [ { - name = "NODE_OPTIONS", + name = "NODE_OPTIONS", value = "--max_old_space_size=1024" } ], @@ -587,14 +763,14 @@ resource "aws_ecs_task_definition" "retool_telemetry" { } resource "aws_service_discovery_private_dns_namespace" "retool_namespace" { - count = (var.code_executor_enabled || var.telemetry_enabled || var.workflows_enabled) ? 1 : 0 + count = (var.code_executor_enabled || var.telemetry_enabled || var.workflows_enabled || var.agents_enabled) ? 1 : 0 name = local.service_discovery_namespace description = "Service Discovery namespace for Retool deployment" vpc = var.vpc_id } resource "aws_service_discovery_service" "retool_workflow_backend_service" { - count = var.workflows_enabled ? 1 : 0 + count = (var.workflows_enabled || var.agents_enabled) ? 1 : 0 name = "workflow-backend" dns_config { @@ -614,7 +790,7 @@ resource "aws_service_discovery_service" "retool_workflow_backend_service" { } resource "aws_service_discovery_service" "retool_code_executor_service" { - count = var.code_executor_enabled ? 
1 : 0 + count = (var.code_executor_enabled || var.workflows_enabled || var.agents_enabled) ? 1 : 0 name = "code-executor" dns_config { @@ -635,7 +811,7 @@ resource "aws_service_discovery_service" "retool_code_executor_service" { resource "aws_service_discovery_service" "retool_telemetry_service" { count = var.telemetry_enabled ? 1 : 0 - name = "telemetry" + name = "telemetry" dns_config { namespace_id = aws_service_discovery_private_dns_namespace.retool_namespace[0].id @@ -654,7 +830,7 @@ resource "aws_service_discovery_service" "retool_telemetry_service" { } module "temporal" { - count = var.workflows_enabled && !var.use_existing_temporal_cluster ? 1 : 0 + count = (var.workflows_enabled || var.agents_enabled) && !var.use_existing_temporal_cluster ? 1 : 0 source = "./temporal" deployment_name = "${var.deployment_name}-temporal" vpc_id = var.vpc_id diff --git a/modules/aws_ecs/variables.tf b/modules/aws_ecs/variables.tf index 796131a..e7199be 100644 --- a/modules/aws_ecs/variables.tf +++ b/modules/aws_ecs/variables.tf @@ -73,16 +73,21 @@ variable "retool_license_key" { default = "EXPIRED-LICENSE-KEY-TRIAL" } +variable "base_domain" { + type = string + description = "Base domain URL for the deployment (e.g., https://retool.example.com or http://retool-alb-123.region.elb.amazonaws.com). Required by Retool backend at startup." +} + variable "ecs_retool_image" { type = string - description = "Container image for desired Retool version. Defaults to `3.114.2-stable`" - default = "tryretool/backend:3.114.2-stable" + description = "Container image for desired Retool version. Defaults to `3.253.8-stable`" + default = "tryretool/backend:3.253.8-stable" } variable "ecs_code_executor_image" { type = string - description = "Container image for desired code_executor version. Defaults to `3.114.2-stable`" - default = "tryretool/code-executor-service:3.114.2-stable" + description = "Container image for desired code_executor version. Defaults to ecs_retool_image if not specified." 
+ default = "" } variable "ecs_telemetry_image" { @@ -123,6 +128,14 @@ variable "ecs_task_resource_map" { cpu = 2048 memory = 4096 } + agent_worker = { + cpu = 2048 + memory = 4096 + } + agent_eval_worker = { + cpu = 2048 + memory = 4096 + } telemetry = { cpu = 1024 memory = 2048 @@ -364,7 +377,7 @@ variable "temporal_aurora_instances" { variable "temporal_image" { type = string description = "Docker image for Temporal" - default = "tryretool/one-offs:retool-temporal-1.1.5" + default = "tryretool/one-offs:retool-temporal-1.1.6" } variable "workflows_enabled" { @@ -379,6 +392,12 @@ variable "code_executor_enabled" { description = "Whether to enable code_executor service to support Python execution. Defaults to false." } +variable "agents_enabled" { + type = bool + default = false + description = "Whether to enable agents. Defaults to false." +} + variable "telemetry_enabled" { type = bool default = false