Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/workflows/build.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions cdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"@aws-cdk/mixins-preview": "2.238.0-alpha.0",
"@aws-sdk/client-bedrock-agentcore": "^3.1021.0",
"@aws-sdk/client-bedrock-runtime": "^3.1021.0",
"@aws-sdk/client-ecs": "^3.1021.0",
"@aws-sdk/client-dynamodb": "^3.1021.0",
"@aws-sdk/client-lambda": "^3.1021.0",
"@aws-sdk/client-secrets-manager": "^3.1021.0",
Expand Down
150 changes: 150 additions & 0 deletions cdk/src/constructs/ecs-agent-cluster.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/**
* MIT No Attribution
*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy of
* the Software without restriction, including without limitation the rights to
* use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
* the Software, and to permit persons to whom the Software is furnished to do so.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

import { RemovalPolicy } from 'aws-cdk-lib';
import * as dynamodb from 'aws-cdk-lib/aws-dynamodb';
import * as ec2 from 'aws-cdk-lib/aws-ec2';
import * as ecr_assets from 'aws-cdk-lib/aws-ecr-assets';
import * as ecs from 'aws-cdk-lib/aws-ecs';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as logs from 'aws-cdk-lib/aws-logs';
import * as secretsmanager from 'aws-cdk-lib/aws-secretsmanager';
import { NagSuppressions } from 'cdk-nag';
import { Construct } from 'constructs';

export interface EcsAgentClusterProps {
readonly vpc: ec2.IVpc;
readonly agentImageAsset: ecr_assets.DockerImageAsset;
readonly taskTable: dynamodb.ITable;
readonly taskEventsTable: dynamodb.ITable;
readonly userConcurrencyTable: dynamodb.ITable;
readonly githubTokenSecret: secretsmanager.ISecret;
readonly memoryId?: string;
}

export class EcsAgentCluster extends Construct {
public readonly cluster: ecs.Cluster;
public readonly taskDefinition: ecs.FargateTaskDefinition;
public readonly securityGroup: ec2.SecurityGroup;
public readonly containerName: string;

constructor(scope: Construct, id: string, props: EcsAgentClusterProps) {
super(scope, id);

this.containerName = 'AgentContainer';

// ECS Cluster with Fargate capacity provider and container insights
this.cluster = new ecs.Cluster(this, 'Cluster', {
vpc: props.vpc,
containerInsights: true,
});

// Security group — egress TCP 443 only
this.securityGroup = new ec2.SecurityGroup(this, 'TaskSG', {
vpc: props.vpc,
description: 'ECS Agent Tasks - egress TCP 443 only',
allowAllOutbound: false,
});

this.securityGroup.addEgressRule(
ec2.Peer.anyIpv4(),
ec2.Port.tcp(443),
'Allow HTTPS egress (GitHub API, AWS services)',
);

// CloudWatch log group for agent task output
const logGroup = new logs.LogGroup(this, 'TaskLogGroup', {
logGroupName: '/ecs/abca-agent-tasks',
retention: logs.RetentionDays.THREE_MONTHS,
removalPolicy: RemovalPolicy.DESTROY,
});

// Task execution role (used by ECS agent to pull images, write logs)
// CDK creates this automatically via taskDefinition, but we need to
// grant additional permissions to the task role.

// Fargate task definition
this.taskDefinition = new ecs.FargateTaskDefinition(this, 'TaskDef', {
cpu: 2048,
memoryLimitMiB: 4096,
runtimePlatform: {
cpuArchitecture: ecs.CpuArchitecture.ARM64,
operatingSystemFamily: ecs.OperatingSystemFamily.LINUX,
},
});

// Container
this.taskDefinition.addContainer(this.containerName, {
image: ecs.ContainerImage.fromDockerImageAsset(props.agentImageAsset),
logging: ecs.LogDrivers.awsLogs({
logGroup,
streamPrefix: 'agent',
}),
environment: {
CLAUDE_CODE_USE_BEDROCK: '1',
TASK_TABLE_NAME: props.taskTable.tableName,
TASK_EVENTS_TABLE_NAME: props.taskEventsTable.tableName,
USER_CONCURRENCY_TABLE_NAME: props.userConcurrencyTable.tableName,
LOG_GROUP_NAME: logGroup.logGroupName,
...(props.memoryId && { MEMORY_ID: props.memoryId }),
},
});

// Task role permissions
const taskRole = this.taskDefinition.taskRole;

// DynamoDB read/write on task tables
props.taskTable.grantReadWriteData(taskRole);
props.taskEventsTable.grantReadWriteData(taskRole);
props.userConcurrencyTable.grantReadWriteData(taskRole);

// Secrets Manager read for GitHub token
props.githubTokenSecret.grantRead(taskRole);

// Bedrock model invocation
taskRole.addToPrincipalPolicy(new iam.PolicyStatement({
actions: [
'bedrock:InvokeModel',
'bedrock:InvokeModelWithResponseStream',
],
resources: ['*'],
}));

// CloudWatch Logs write
logGroup.grantWrite(taskRole);

NagSuppressions.addResourceSuppressions(this.taskDefinition, [
{
id: 'AwsSolutions-IAM5',
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; Bedrock InvokeModel requires * resource; Secrets Manager wildcards from CDK grantRead; CloudWatch Logs wildcards from CDK grantWrite',
},
{
id: 'AwsSolutions-ECS2',
reason: 'Environment variables contain table names and configuration, not secrets — GitHub token is fetched from Secrets Manager at runtime',
},
], true);

NagSuppressions.addResourceSuppressions(this.cluster, [
{
id: 'AwsSolutions-ECS4',
reason: 'Container insights is enabled via the containerInsights prop',
},
], true);
}
}
60 changes: 59 additions & 1 deletion cdk/src/constructs/task-orchestrator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,32 @@ export interface TaskOrchestratorProps {
* and writes fallback episodes during finalization.
*/
readonly memoryId?: string;

/**
* ARN of the ECS cluster for ECS compute strategy.
* When provided, ECS-related env vars and IAM policies are added.
*/
readonly ecsClusterArn?: string;

/**
* ARN of the ECS task definition for ECS compute strategy.
*/
readonly ecsTaskDefinitionArn?: string;

/**
* Comma-separated subnet IDs for ECS tasks.
*/
readonly ecsSubnets?: string;

/**
* Security group ID for ECS tasks.
*/
readonly ecsSecurityGroup?: string;

/**
* Container name in the ECS task definition.
*/
readonly ecsContainerName?: string;
}

/**
Expand Down Expand Up @@ -152,6 +178,11 @@ export class TaskOrchestrator extends Construct {
USER_PROMPT_TOKEN_BUDGET: String(props.userPromptTokenBudget),
}),
...(props.memoryId && { MEMORY_ID: props.memoryId }),
...(props.ecsClusterArn && { ECS_CLUSTER_ARN: props.ecsClusterArn }),
...(props.ecsTaskDefinitionArn && { ECS_TASK_DEFINITION_ARN: props.ecsTaskDefinitionArn }),
...(props.ecsSubnets && { ECS_SUBNETS: props.ecsSubnets }),
...(props.ecsSecurityGroup && { ECS_SECURITY_GROUP: props.ecsSecurityGroup }),
...(props.ecsContainerName && { ECS_CONTAINER_NAME: props.ecsContainerName }),
},
bundling: {
externalModules: ['@aws-sdk/*'],
Expand Down Expand Up @@ -192,6 +223,33 @@ export class TaskOrchestrator extends Construct {
resources: runtimeResources,
}));

// ECS compute strategy permissions (only when ECS is configured)
if (props.ecsClusterArn) {
this.fn.addToRolePolicy(new iam.PolicyStatement({
actions: [
'ecs:RunTask',
'ecs:DescribeTasks',
'ecs:StopTask',
],
resources: ['*'],
conditions: {
ArnEquals: {
'ecs:cluster': props.ecsClusterArn,
},
},
}));

this.fn.addToRolePolicy(new iam.PolicyStatement({
actions: ['iam:PassRole'],
resources: ['*'],
conditions: {
StringEquals: {
'iam:PassedToService': 'ecs-tasks.amazonaws.com',
},
},
}));
Comment on lines +242 to +250
Copy link

Copilot AI Apr 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The iam:PassRole permission is granted on Resource: '*' with only an iam:PassedToService condition. This is still overly broad: it allows the orchestrator to pass any IAM role to ECS tasks (potential privilege escalation if the function is ever compromised). Prefer restricting Resource to the specific task role and execution role ARNs associated with the configured ECS task definition, and pass those ARNs into TaskOrchestratorProps (or derive them if you own the task definition construct).

Copilot uses AI. Check for mistakes.
}

// Per-repo Secrets Manager grants (e.g. per-repo GitHub tokens from Blueprints)
for (const [index, secretArn] of (props.additionalSecretArns ?? []).entries()) {
const secret = secretsmanager.Secret.fromSecretCompleteArn(
Expand Down Expand Up @@ -229,7 +287,7 @@ export class TaskOrchestrator extends Construct {
},
{
id: 'AwsSolutions-IAM5',
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; AgentCore runtime/* required for sub-resource invocation; Secrets Manager wildcards generated by CDK grantRead; AgentCore Memory wildcards generated by CDK grantRead/grantWrite',
reason: 'DynamoDB index/* wildcards generated by CDK grantReadWriteData; AgentCore runtime/* required for sub-resource invocation; Secrets Manager wildcards generated by CDK grantRead; AgentCore Memory wildcards generated by CDK grantRead/grantWrite; ECS RunTask/DescribeTasks/StopTask conditioned on cluster ARN; iam:PassRole conditioned on ecs-tasks.amazonaws.com',
},
], true);
}
Expand Down
68 changes: 57 additions & 11 deletions cdk/src/handlers/orchestrate-task.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@

import { withDurableExecution, type DurableExecutionHandler } from '@aws/durable-execution-sdk-js';
import { TaskStatus, TERMINAL_STATUSES } from '../constructs/task-status';
import { resolveComputeStrategy } from './shared/compute-strategy';
import { logger } from './shared/logger';
import {
admissionControl,
emitTaskEvent,
Expand All @@ -28,7 +30,7 @@ import {
loadBlueprintConfig,
loadTask,
pollTaskStatus,
startSession,
transitionTask,
type PollState,
} from './shared/orchestrator';
import { runPreflightChecks } from './shared/preflight';
Expand Down Expand Up @@ -116,27 +118,71 @@ const durableHandler: DurableExecutionHandler<OrchestrateTaskEvent, void> = asyn
}
});

// Step 4: Start agent session — invoke runtime and transition to RUNNING
await context.step('start-session', async () => {
// Step 4: Start agent session — resolve compute strategy, invoke runtime, transition to RUNNING
// Returns the full SessionHandle (serializable) so ECS polling can use it in step 5.
const sessionHandle = await context.step('start-session', async () => {
try {
return await startSession(task, payload, blueprintConfig);
const strategy = resolveComputeStrategy(blueprintConfig);
const handle = await strategy.startSession({ taskId, payload, blueprintConfig });

await transitionTask(taskId, TaskStatus.HYDRATING, TaskStatus.RUNNING, {
session_id: handle.sessionId,
started_at: new Date().toISOString(),
});
await emitTaskEvent(taskId, 'session_started', {
session_id: handle.sessionId,
strategy_type: handle.strategyType,
});

logger.info('Session started', {
task_id: taskId,
session_id: handle.sessionId,
strategy_type: handle.strategyType,
});

return handle;
} catch (err) {
await failTask(taskId, TaskStatus.HYDRATING, `Session start failed: ${String(err)}`, task.user_id, true);
throw err;
}
});

// Step 5: Wait for agent to finish
// NOTE: Polls DynamoDB every 30s rather than re-invoking the AgentCore session.
// The agent writes terminal status directly to DDB. If the agent crashes without
// writing a terminal status, we detect it via the HYDRATING early-exit check
// (MAX_NON_RUNNING_POLLS ~5min); otherwise the loop runs up to MAX_POLL_ATTEMPTS
// (~8.5h). A future improvement could add AgentCore session status checks for
// faster crash detection.
// Polls DynamoDB every 30s. For ECS compute, also polls the ECS task to detect
// container crashes that don't write terminal status to DDB. For AgentCore,
// crash detection relies on the HYDRATING early-exit (MAX_NON_RUNNING_POLLS ~5min).
const finalPollState = await context.waitForCondition<PollState>(
'await-agent-completion',
async (state) => {
return pollTaskStatus(taskId, state);
const ddbState = await pollTaskStatus(taskId, state);

// ECS compute-level crash detection: if DDB is not terminal, check ECS task status
if (
ddbState.lastStatus &&
!TERMINAL_STATUSES.includes(ddbState.lastStatus) &&
blueprintConfig.compute_type === 'ecs'
) {
try {
const strategy = resolveComputeStrategy(blueprintConfig);
const ecsStatus = await strategy.pollSession(sessionHandle);
if (ecsStatus.status === 'failed') {
const errorMsg = 'error' in ecsStatus ? ecsStatus.error : 'ECS task failed';
logger.warn('ECS task failed before DDB terminal write', {
task_id: taskId,
error: errorMsg,
});
await failTask(taskId, ddbState.lastStatus, `ECS container failed: ${errorMsg}`, task.user_id, true);
return { attempts: ddbState.attempts, lastStatus: TaskStatus.FAILED };
}
} catch (err) {
logger.warn('ECS pollSession check failed (non-fatal)', {
task_id: taskId,
error: err instanceof Error ? err.message : String(err),
});
}
}

return ddbState;
},
{
initialState: { attempts: 0 },
Expand Down
Loading
Loading