From 4d647deceaa3144fcb59985f57da5624f74f5174 Mon Sep 17 00:00:00 2001 From: krokoko Date: Thu, 9 Apr 2026 14:34:05 -0500 Subject: [PATCH] chore(guardrails): address review comments --- README.md | 4 +- cdk/src/constructs/task-orchestrator.ts | 37 ++++- cdk/src/handlers/orchestrate-task.ts | 4 +- cdk/src/handlers/shared/context-hydration.ts | 101 +++++++++++- cdk/src/handlers/shared/create-task-core.ts | 16 +- cdk/src/handlers/shared/orchestrator.ts | 20 +++ cdk/src/stacks/agent.ts | 2 + cdk/test/constructs/task-orchestrator.test.ts | 76 ++++++++- cdk/test/handlers/orchestrate-task.test.ts | 52 ++++++ .../handlers/shared/context-hydration.test.ts | 153 ++++++++++++++++++ .../handlers/shared/create-task-core.test.ts | 5 +- docs/design/API_CONTRACT.md | 7 +- docs/design/ARCHITECTURE.md | 3 +- docs/design/INPUT_GATEWAY.md | 2 +- docs/design/OBSERVABILITY.md | 5 + docs/design/ORCHESTRATOR.md | 12 +- docs/design/SECURITY.md | 12 +- docs/guides/DEVELOPER_GUIDE.md | 2 +- docs/guides/ROADMAP.md | 7 +- docs/guides/USER_GUIDE.md | 7 +- docs/src/content/docs/design/Api-contract.md | 7 +- docs/src/content/docs/design/Architecture.md | 3 +- docs/src/content/docs/design/Input-gateway.md | 2 +- docs/src/content/docs/design/Observability.md | 5 + docs/src/content/docs/design/Orchestrator.md | 12 +- docs/src/content/docs/design/Security.md | 12 +- .../docs/developer-guide/Project-structure.md | 2 +- docs/src/content/docs/roadmap/Roadmap.md | 7 +- .../content/docs/user-guide/Task-lifecycle.md | 4 +- docs/src/content/docs/user-guide/Tips.md | 1 + .../docs/user-guide/Using-the-rest-api.md | 2 + 31 files changed, 534 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index 4e80c7f..a7301b7 100644 --- a/README.md +++ b/README.md @@ -59,10 +59,10 @@ ABCA is under active development. 
The platform ships iteratively — each iterat | **3a** | Done | Repo onboarding, per-repo GitHub App credentials, turn caps, prompt guide | | **3b** | Done | Memory Tier 1, insights, agent self-feedback, prompt versioning, commit attribution | | **3bis** | Done | Hardening — reconciler error tracking, error serialization, test coverage gaps | -| **3c** | WIP | Pre-flight checks, persistent session storage, deterministic validation, PR review task type, multi-modal input | +| **3c** | WIP | Pre-flight checks, persistent session storage, deterministic validation, PR review task type, multi-modal input, input guardrail screening | | **3d** | Planned | Review feedback loop, PR outcome tracking, evaluation pipeline | | **4** | Planned | GitLab, visual proof, Slack, control panel, WebSocket streaming | -| **5** | Planned | Pre-warming, multi-user/team, cost management, guardrails, alternate runtime | +| **5** | Planned | Pre-warming, multi-user/team, cost management, output guardrails, alternate runtime | | **6** | Planned | Skills learning, multi-repo, iterative feedback, multiplayer, CDK constructs | See the full [ROADMAP](./docs/guides/ROADMAP.md) for details on each iteration. diff --git a/cdk/src/constructs/task-orchestrator.ts b/cdk/src/constructs/task-orchestrator.ts index 34f5944..7da3030 100644 --- a/cdk/src/constructs/task-orchestrator.ts +++ b/cdk/src/constructs/task-orchestrator.ts @@ -18,7 +18,7 @@ */ import * as path from 'path'; -import { Duration } from 'aws-cdk-lib'; +import { Duration, Stack } from 'aws-cdk-lib'; import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch'; import * as dynamodb from 'aws-cdk-lib/aws-dynamodb'; import * as iam from 'aws-cdk-lib/aws-iam'; @@ -100,6 +100,18 @@ export interface TaskOrchestratorProps { * and writes fallback episodes during finalization. */ readonly memoryId?: string; + + /** + * Bedrock Guardrail ID used by the orchestrator to screen assembled PR prompts + * for prompt injection during context hydration. 
The same guardrail is also + * used by the Task API for submission-time task description screening. + */ + readonly guardrailId?: string; + + /** + * Bedrock Guardrail version. Required when guardrailId is provided. + */ + readonly guardrailVersion?: string; } /** @@ -125,6 +137,13 @@ export class TaskOrchestrator extends Construct { constructor(scope: Construct, id: string, props: TaskOrchestratorProps) { super(scope, id); + if (props.guardrailId && !props.guardrailVersion) { + throw new Error('guardrailVersion is required when guardrailId is provided'); + } + if (!props.guardrailId && props.guardrailVersion) { + throw new Error('guardrailId is required when guardrailVersion is provided'); + } + const handlersDir = path.join(__dirname, '..', 'handlers'); const maxConcurrent = props.maxConcurrentTasksPerUser ?? 3; @@ -152,6 +171,8 @@ export class TaskOrchestrator extends Construct { USER_PROMPT_TOKEN_BUDGET: String(props.userPromptTokenBudget), }), ...(props.memoryId && { MEMORY_ID: props.memoryId }), + ...(props.guardrailId && { GUARDRAIL_ID: props.guardrailId }), + ...(props.guardrailVersion && { GUARDRAIL_VERSION: props.guardrailVersion }), }, bundling: { externalModules: ['@aws-sdk/*'], @@ -200,6 +221,20 @@ export class TaskOrchestrator extends Construct { secret.grantRead(this.fn); } + // Bedrock Guardrail permissions + if (props.guardrailId) { + this.fn.addToRolePolicy(new iam.PolicyStatement({ + actions: ['bedrock:ApplyGuardrail'], + resources: [ + Stack.of(this).formatArn({ + service: 'bedrock', + resource: 'guardrail', + resourceName: props.guardrailId, + }), + ], + })); + } + // Create alias for durable function invocation const fnAlias = this.fn.currentVersion.addAlias('live'); this.alias = fnAlias; diff --git a/cdk/src/handlers/orchestrate-task.ts b/cdk/src/handlers/orchestrate-task.ts index a93b8e8..a4ae206 100644 --- a/cdk/src/handlers/orchestrate-task.ts +++ b/cdk/src/handlers/orchestrate-task.ts @@ -110,8 +110,8 @@ const durableHandler: 
DurableExecutionHandler = asyn try { return await hydrateAndTransition(task, blueprintConfig); } catch (err) { - // Transition may fail if task was externally cancelled — release concurrency - await failTask(taskId, task.status, `Hydration failed: ${String(err)}`, task.user_id, true); + // Hydration may fail due to external cancellation, guardrail blocking, or guardrail API failure — fail the task and release concurrency + await failTask(taskId, TaskStatus.HYDRATING, `Hydration failed: ${String(err)}`, task.user_id, true); throw err; } }); diff --git a/cdk/src/handlers/shared/context-hydration.ts b/cdk/src/handlers/shared/context-hydration.ts index 950c391..49ba1c5 100644 --- a/cdk/src/handlers/shared/context-hydration.ts +++ b/cdk/src/handlers/shared/context-hydration.ts @@ -17,6 +17,7 @@ * SOFTWARE. */ +import { ApplyGuardrailCommand, BedrockRuntimeClient } from '@aws-sdk/client-bedrock-runtime'; import { GetSecretValueCommand, SecretsManagerClient } from '@aws-sdk/client-secrets-manager'; import { logger } from './logger'; import { loadMemoryContext, type MemoryContext } from './memory'; @@ -85,6 +86,7 @@ export interface HydratedContext { readonly token_estimate: number; readonly truncated: boolean; readonly fallback_error?: string; + readonly guardrail_blocked?: string; readonly resolved_branch_name?: string; readonly resolved_base_branch?: string; } @@ -96,6 +98,81 @@ export interface HydratedContext { const GITHUB_TOKEN_SECRET_ARN = process.env.GITHUB_TOKEN_SECRET_ARN; const USER_PROMPT_TOKEN_BUDGET = Number(process.env.USER_PROMPT_TOKEN_BUDGET ?? '100000'); const GITHUB_API_TIMEOUT_MS = 30_000; +const GUARDRAIL_ID = process.env.GUARDRAIL_ID; +const GUARDRAIL_VERSION = process.env.GUARDRAIL_VERSION; +const bedrockClient = (GUARDRAIL_ID && GUARDRAIL_VERSION) ? 
new BedrockRuntimeClient({}) : undefined; +if (GUARDRAIL_ID && !GUARDRAIL_VERSION) { + logger.error('GUARDRAIL_ID is set but GUARDRAIL_VERSION is missing — guardrail screening disabled', { + metric_type: 'guardrail_misconfiguration', + }); +} + +// --------------------------------------------------------------------------- +// Bedrock Guardrail screening +// --------------------------------------------------------------------------- + +/** + * Error thrown when the Bedrock Guardrail API call fails. Distinguished from + * other errors so the outer catch in hydrateContext can re-throw it instead of + * falling back to unscreened content (fail-closed). + */ +export class GuardrailScreeningError extends Error { + constructor(message: string, cause?: Error) { + super(message, cause ? { cause } : undefined); + this.name = 'GuardrailScreeningError'; + } +} + +/** + * Screen text through the Bedrock Guardrail for prompt injection detection. + * Fail-closed: throws on Bedrock errors so unscreened content never reaches the agent. + * @param text - the text to screen. + * @param taskId - the task ID (for logging). + * @returns 'GUARDRAIL_INTERVENED' if blocked, 'NONE' if allowed, undefined when guardrail is + * not configured (env vars missing). + * @throws GuardrailScreeningError when the Bedrock Guardrail API call fails (fail-closed). 
+ */ +export async function screenWithGuardrail(text: string, taskId: string): Promise<'GUARDRAIL_INTERVENED' | 'NONE' | undefined> { + if (!bedrockClient || !GUARDRAIL_ID || !GUARDRAIL_VERSION) { + logger.info('Guardrail screening skipped — guardrail not configured', { + task_id: taskId, + metric_type: 'guardrail_screening_skipped', + }); + return undefined; + } + + try { + const result = await bedrockClient.send(new ApplyGuardrailCommand({ + guardrailIdentifier: GUARDRAIL_ID, + guardrailVersion: GUARDRAIL_VERSION, + source: 'INPUT', + content: [{ text: { text } }], + })); + + if (result.action === 'GUARDRAIL_INTERVENED') { + logger.warn('Content blocked by guardrail', { + task_id: taskId, + guardrail_id: GUARDRAIL_ID, + guardrail_version: GUARDRAIL_VERSION, + }); + return 'GUARDRAIL_INTERVENED'; + } + + return 'NONE'; + } catch (err) { + logger.error('Guardrail screening failed (fail-closed)', { + task_id: taskId, + guardrail_id: GUARDRAIL_ID, + error: err instanceof Error ? err.message : String(err), + error_name: err instanceof Error ? err.name : undefined, + metric_type: 'guardrail_screening_failure', + }); + throw new GuardrailScreeningError( + `Guardrail screening unavailable: ${err instanceof Error ? err.message : String(err)}`, + err instanceof Error ? err : undefined, + ); + } +} // --------------------------------------------------------------------------- // GitHub token resolution (Secrets Manager with caching) @@ -715,11 +792,15 @@ export interface HydrateContextOptions { } /** - * Hydrate context for a task: resolve GitHub token, fetch issue, enforce - * token budget, and assemble the user prompt. + * Hydrate context for a task: resolve GitHub token, fetch issue/PR, enforce + * token budget, assemble the user prompt, and (for PR tasks) screen through + * Bedrock Guardrail for prompt injection. * @param task - the task record from DynamoDB. * @param options - optional per-repo overrides. - * @returns the hydrated context. 
+ * @returns the hydrated context. For PR tasks, `guardrail_blocked` is set when + * the guardrail intervened. + * @throws GuardrailScreeningError when the Bedrock Guardrail API call fails + * (fail-closed — propagated to prevent unscreened content from reaching the agent). */ export async function hydrateContext(task: TaskRecord, options?: HydrateContextOptions): Promise { const sources: string[] = []; @@ -889,7 +970,10 @@ export async function hydrateContext(task: TaskRecord, options?: HydrateContextO resolvedBranchName = prResult.head_ref; resolvedBaseBranch = prResult.base_ref; - return { + // Screen assembled PR prompt through Bedrock Guardrail for prompt injection + const guardrailAction = await screenWithGuardrail(userPrompt, task.task_id); + + const prContext: HydratedContext = { version: 1, user_prompt: userPrompt, memory_context: memoryContext, @@ -898,7 +982,12 @@ export async function hydrateContext(task: TaskRecord, options?: HydrateContextO sources, token_estimate: estimateTokens(userPrompt), truncated, + ...(guardrailAction === 'GUARDRAIL_INTERVENED' && { + guardrail_blocked: 'PR context blocked by content policy', + }), }; + + return prContext; } // Standard task: existing behavior @@ -918,6 +1007,10 @@ export async function hydrateContext(task: TaskRecord, options?: HydrateContextO truncated: budgetResult.truncated, }; } catch (err) { + // Guardrail failures must propagate (fail-closed) — unscreened content must not reach the agent + if (err instanceof GuardrailScreeningError) { + throw err; + } // Fallback: minimal context from task_description only logger.error('Unexpected error during context hydration', { task_id: task.task_id, error: err instanceof Error ? 
err.message : String(err), diff --git a/cdk/src/handlers/shared/create-task-core.ts b/cdk/src/handlers/shared/create-task-core.ts index f0f1b3d..5f003c4 100644 --- a/cdk/src/handlers/shared/create-task-core.ts +++ b/cdk/src/handlers/shared/create-task-core.ts @@ -46,7 +46,13 @@ export interface TaskCreationContext { const ddb = DynamoDBDocumentClient.from(new DynamoDBClient({})); const lambdaClient = process.env.ORCHESTRATOR_FUNCTION_ARN ? new LambdaClient({}) : undefined; -const bedrockClient = process.env.GUARDRAIL_ID ? new BedrockRuntimeClient({}) : undefined; +const bedrockClient = (process.env.GUARDRAIL_ID && process.env.GUARDRAIL_VERSION) + ? new BedrockRuntimeClient({}) : undefined; +if (process.env.GUARDRAIL_ID && !process.env.GUARDRAIL_VERSION) { + logger.error('GUARDRAIL_ID is set but GUARDRAIL_VERSION is missing — guardrail screening disabled', { + metric_type: 'guardrail_misconfiguration', + }); +} const TABLE_NAME = process.env.TASK_TABLE_NAME!; const EVENTS_TABLE_NAME = process.env.TASK_EVENTS_TABLE_NAME!; const TASK_RETENTION_DAYS = Number(process.env.TASK_RETENTION_DAYS ?? '90'); @@ -117,8 +123,8 @@ export async function createTaskCore( } const userMaxBudgetUsd = maxBudgetResult; - // 2. Screen task description with Bedrock Guardrail (fail-open: a Bedrock outage - // should not block all task submissions — log the error and proceed) + // 2. 
Screen task description with Bedrock Guardrail (fail-closed: unscreened content + // must not reach the agent — a Bedrock outage blocks task submissions) if (bedrockClient && body.task_description) { try { const guardrailResult = await bedrockClient.send(new ApplyGuardrailCommand({ @@ -133,11 +139,13 @@ export async function createTaskCore( return errorResponse(400, ErrorCode.VALIDATION_ERROR, 'Task description was blocked by content policy.', requestId); } } catch (guardrailErr) { - logger.error('Guardrail screening failed — proceeding without screening (fail-open)', { + logger.error('Guardrail screening failed (fail-closed)', { error: String(guardrailErr), user_id: context.userId, request_id: requestId, + metric_type: 'guardrail_screening_failure', }); + return errorResponse(503, ErrorCode.INTERNAL_ERROR, 'Content screening is temporarily unavailable. Please try again later.', requestId); } } diff --git a/cdk/src/handlers/shared/orchestrator.ts b/cdk/src/handlers/shared/orchestrator.ts index f06b6e0..0c2e9d2 100644 --- a/cdk/src/handlers/shared/orchestrator.ts +++ b/cdk/src/handlers/shared/orchestrator.ts @@ -252,6 +252,26 @@ export async function hydrateAndTransition(task: TaskRecord, blueprintConfig?: B memoryId: MEMORY_ID, }); + // If guardrail screening blocked the hydrated context, emit audit event and throw + // to trigger task failure (the caller in orchestrate-task.ts catches and transitions to FAILED) + if (hydratedContext.guardrail_blocked) { + try { + await emitTaskEvent(task.task_id, 'guardrail_blocked', { + reason: hydratedContext.guardrail_blocked, + task_type: task.task_type, + pr_number: task.pr_number, + sources: hydratedContext.sources, + token_estimate: hydratedContext.token_estimate, + }); + } catch (eventErr) { + logger.error('Failed to emit guardrail_blocked event', { + task_id: task.task_id, + error: eventErr instanceof Error ? 
eventErr.message : String(eventErr), + }); + } + throw new Error(`Guardrail blocked: ${hydratedContext.guardrail_blocked}`); + } + // For PR iteration: resolve actual branch name from PR head_ref if (hydratedContext.resolved_branch_name) { try { diff --git a/cdk/src/stacks/agent.ts b/cdk/src/stacks/agent.ts index f351a61..afb81f5 100644 --- a/cdk/src/stacks/agent.ts +++ b/cdk/src/stacks/agent.ts @@ -282,6 +282,8 @@ export class AgentStack extends Stack { runtimeArn: runtime.agentRuntimeArn, githubTokenSecretArn: githubTokenSecret.secretArn, memoryId: agentMemory.memory.memoryId, + guardrailId: inputGuardrail.guardrailId, + guardrailVersion: inputGuardrail.guardrailVersion, }); // Grant the orchestrator Lambda read+write access to memory diff --git a/cdk/test/constructs/task-orchestrator.test.ts b/cdk/test/constructs/task-orchestrator.test.ts index 7748b1d..0b2c2ef 100644 --- a/cdk/test/constructs/task-orchestrator.test.ts +++ b/cdk/test/constructs/task-orchestrator.test.ts @@ -30,6 +30,8 @@ interface StackOverrides { additionalRuntimeArns?: string[]; additionalSecretArns?: string[]; memoryId?: string; + guardrailId?: string; + guardrailVersion?: string; } function createStack(overrides?: StackOverrides): { stack: Stack; template: Template } { @@ -55,7 +57,7 @@ function createStack(overrides?: StackOverrides): { stack: Stack; template: Temp }) : undefined; - const { includeRepoTable: _, additionalRuntimeArns, additionalSecretArns, memoryId, ...rest } = overrides ?? {}; + const { includeRepoTable: _, additionalRuntimeArns, additionalSecretArns, memoryId, guardrailId, guardrailVersion, ...rest } = overrides ?? 
{}; new TaskOrchestrator(stack, 'TaskOrchestrator', { taskTable, @@ -66,6 +68,8 @@ function createStack(overrides?: StackOverrides): { stack: Stack; template: Temp ...(additionalRuntimeArns && { additionalRuntimeArns }), ...(additionalSecretArns && { additionalSecretArns }), ...(memoryId && { memoryId }), + ...(guardrailId && { guardrailId }), + ...(guardrailVersion && { guardrailVersion }), ...rest, }); @@ -341,4 +345,74 @@ describe('TaskOrchestrator construct', () => { MaximumRetryAttempts: 0, }); }); + + test('includes GUARDRAIL_ID and GUARDRAIL_VERSION when provided', () => { + const { template } = createStack({ guardrailId: 'gr-test-123', guardrailVersion: '1' }); + template.hasResourceProperties('AWS::Lambda::Function', { + Environment: { + Variables: Match.objectLike({ + GUARDRAIL_ID: 'gr-test-123', + GUARDRAIL_VERSION: '1', + }), + }, + }); + }); + + test('does not include GUARDRAIL_ID when not provided', () => { + const { template } = createStack(); + const functions = template.findResources('AWS::Lambda::Function'); + for (const [, fn] of Object.entries(functions)) { + const envVars = (fn as any).Properties.Environment?.Variables ?? 
{}; + expect(envVars).not.toHaveProperty('GUARDRAIL_ID'); + expect(envVars).not.toHaveProperty('GUARDRAIL_VERSION'); + } + }); + + test('grants bedrock:ApplyGuardrail scoped to guardrail ARN when guardrailId is provided', () => { + const { template } = createStack({ guardrailId: 'gr-test-123', guardrailVersion: '1' }); + template.hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: Match.arrayWith([ + Match.objectLike({ + Action: 'bedrock:ApplyGuardrail', + Effect: 'Allow', + Resource: { + 'Fn::Join': Match.arrayWith([ + Match.arrayWith([ + Match.stringLikeRegexp('guardrail/gr-test-123'), + ]), + ]), + }, + }), + ]), + }, + }); + }); + + test('does not grant bedrock:ApplyGuardrail when guardrailId is not provided', () => { + const { template } = createStack(); + const policies = template.findResources('AWS::IAM::Policy'); + for (const [, policy] of Object.entries(policies)) { + const statements = (policy as any).Properties.PolicyDocument.Statement; + for (const stmt of statements) { + if (typeof stmt.Action === 'string') { + expect(stmt.Action).not.toBe('bedrock:ApplyGuardrail'); + } else if (Array.isArray(stmt.Action)) { + expect(stmt.Action).not.toContain('bedrock:ApplyGuardrail'); + } + } + } + }); + + test('throws when guardrailId is provided without guardrailVersion', () => { + expect(() => createStack({ guardrailId: 'gr-test-123' })).toThrow( + 'guardrailVersion is required when guardrailId is provided', + ); + }); + + test('throws when guardrailVersion is provided without guardrailId', () => { + expect(() => createStack({ guardrailVersion: '1' })).toThrow( + 'guardrailId is required when guardrailVersion is provided', + ); + }); }); diff --git a/cdk/test/handlers/orchestrate-task.test.ts b/cdk/test/handlers/orchestrate-task.test.ts index dfa1d39..e08627c 100644 --- a/cdk/test/handlers/orchestrate-task.test.ts +++ b/cdk/test/handlers/orchestrate-task.test.ts @@ -176,6 +176,49 @@ describe('hydrateAndTransition', () => { 
expect(payload.max_turns).toBe(100); }); + test('throws when guardrail_blocked is set on hydrated context', async () => { + mockDdbSend.mockResolvedValue({}); + mockHydrateContext.mockResolvedValueOnce({ + ...mockHydratedContext, + guardrail_blocked: 'PR context blocked by content policy', + }); + const prTask = { ...baseTask, task_type: 'pr_iteration', pr_number: 10 }; + await expect(hydrateAndTransition(prTask as any)).rejects.toThrow( + 'Guardrail blocked: PR context blocked by content policy', + ); + + // Verify guardrail_blocked event was emitted before the throw + const putCalls = mockDdbSend.mock.calls + .filter((c: any) => c[0]._type === 'Put') + .map((c: any) => c[0].input.Item); + const guardrailEvent = putCalls.find((item: any) => item.event_type === 'guardrail_blocked'); + expect(guardrailEvent).toBeDefined(); + expect(guardrailEvent.metadata.reason).toBe('PR context blocked by content policy'); + expect(guardrailEvent.metadata.task_type).toBe('pr_iteration'); + expect(guardrailEvent.metadata.pr_number).toBe(10); + expect(guardrailEvent.metadata.sources).toEqual(['task_description']); + expect(guardrailEvent.metadata.token_estimate).toBe(20); + }); + + test('still throws guardrail error when emitTaskEvent fails during guardrail_blocked handling', async () => { + let callCount = 0; + mockDdbSend.mockImplementation(() => { + callCount++; + // First two calls succeed (transitionTask SUBMITTED->HYDRATING, emitTaskEvent hydration_started) + // Third call is emitTaskEvent('guardrail_blocked') — fail it + if (callCount === 3) return Promise.reject(new Error('DDB write failed')); + return Promise.resolve({}); + }); + mockHydrateContext.mockResolvedValueOnce({ + ...mockHydratedContext, + guardrail_blocked: 'PR context blocked by content policy', + }); + const prTask = { ...baseTask, task_type: 'pr_iteration', pr_number: 10 }; + await expect(hydrateAndTransition(prTask as any)).rejects.toThrow( + 'Guardrail blocked: PR context blocked by content policy', + ); + 
}); + test('hydration_complete event includes source metadata', async () => { mockDdbSend.mockResolvedValue({}); mockHydrateContext.mockResolvedValueOnce(mockHydratedContext); @@ -478,6 +521,15 @@ describe('failTask', () => { expect(mockDdbSend).toHaveBeenCalledTimes(3); }); + test('transitions from HYDRATING to FAILED when called with HYDRATING status', async () => { + mockDdbSend.mockResolvedValue({}); + await failTask('TASK001', 'HYDRATING', 'Guardrail blocked: PR context blocked by content policy', 'user-123', true); + // First call: transitionTask UpdateCommand + const transitionCall = mockDdbSend.mock.calls[0][0]; + expect(transitionCall.input.ExpressionAttributeValues[':fromStatus']).toBe('HYDRATING'); + expect(transitionCall.input.ExpressionAttributeValues[':toStatus']).toBe('FAILED'); + }); + test('handles transition failure gracefully', async () => { mockDdbSend .mockRejectedValueOnce(new Error('Condition failed')) // transitionTask diff --git a/cdk/test/handlers/shared/context-hydration.test.ts b/cdk/test/handlers/shared/context-hydration.test.ts index cc548fb..a4639bb 100644 --- a/cdk/test/handlers/shared/context-hydration.test.ts +++ b/cdk/test/handlers/shared/context-hydration.test.ts @@ -24,6 +24,12 @@ jest.mock('@aws-sdk/client-secrets-manager', () => ({ GetSecretValueCommand: jest.fn((input: unknown) => ({ _type: 'GetSecretValue', input })), })); +const mockBedrockSend = jest.fn(); +jest.mock('@aws-sdk/client-bedrock-runtime', () => ({ + BedrockRuntimeClient: jest.fn(() => ({ send: mockBedrockSend })), + ApplyGuardrailCommand: jest.fn((input: unknown) => ({ _type: 'ApplyGuardrail', input })), +})); + const mockLoadMemoryContext = jest.fn(); jest.mock('../../../src/handlers/shared/memory', () => ({ loadMemoryContext: mockLoadMemoryContext, @@ -32,6 +38,8 @@ jest.mock('../../../src/handlers/shared/memory', () => ({ // Set env vars before importing process.env.GITHUB_TOKEN_SECRET_ARN = 
'arn:aws:secretsmanager:us-east-1:123456789012:secret:github-token'; process.env.USER_PROMPT_TOKEN_BUDGET = '100000'; +process.env.GUARDRAIL_ID = 'gr-test-123'; +process.env.GUARDRAIL_VERSION = '1'; import { assemblePrIterationPrompt, @@ -41,8 +49,10 @@ import { estimateTokens, fetchGitHubIssue, fetchGitHubPullRequest, + GuardrailScreeningError, hydrateContext, resolveGitHubToken, + screenWithGuardrail, type GitHubIssueContext, type IssueComment, } from '../../../src/handlers/shared/context-hydration'; @@ -686,6 +696,7 @@ describe('hydrateContext', () => { .mockResolvedValueOnce(makeGraphQLThreadsResponse([])) .mockResolvedValueOnce({ ok: true, json: async () => ([]) }) .mockResolvedValueOnce({ ok: true, json: async () => ([]) }); + mockBedrockSend.mockResolvedValueOnce({ action: 'NONE' }); const task = { ...baseTask, @@ -986,3 +997,145 @@ describe('assemblePrIterationPrompt', () => { expect(result).toContain('`src/util.ts:5`'); }); }); + +// --------------------------------------------------------------------------- +// screenWithGuardrail +// --------------------------------------------------------------------------- + +describe('screenWithGuardrail', () => { + test('returns NONE when guardrail allows the text', async () => { + mockBedrockSend.mockResolvedValueOnce({ action: 'NONE' }); + const result = await screenWithGuardrail('safe text', 'TASK001'); + expect(result).toBe('NONE'); + expect(mockBedrockSend).toHaveBeenCalledTimes(1); + }); + + test('returns GUARDRAIL_INTERVENED when guardrail blocks the text', async () => { + mockBedrockSend.mockResolvedValueOnce({ action: 'GUARDRAIL_INTERVENED' }); + const result = await screenWithGuardrail('malicious text', 'TASK001'); + expect(result).toBe('GUARDRAIL_INTERVENED'); + }); + + test('throws GuardrailScreeningError on Bedrock error (fail-closed)', async () => { + mockBedrockSend.mockRejectedValueOnce(new Error('Service unavailable')); + const error = await screenWithGuardrail('some text', 'TASK001').catch((e: 
unknown) => e); + expect(error).toBeInstanceOf(GuardrailScreeningError); + expect((error as GuardrailScreeningError).message).toBe('Guardrail screening unavailable: Service unavailable'); + expect((error as GuardrailScreeningError).cause).toBeInstanceOf(Error); + expect(((error as GuardrailScreeningError).cause as Error).message).toBe('Service unavailable'); + }); +}); + +// --------------------------------------------------------------------------- +// hydrateContext — guardrail screening for PR tasks +// --------------------------------------------------------------------------- + +describe('hydrateContext — guardrail screening', () => { + const basePrTask = { + task_id: 'TASK-PR-001', + user_id: 'user-123', + status: 'SUBMITTED', + repo: 'org/repo', + branch_name: 'bgagent/TASK-PR-001/fix', + channel_source: 'api', + status_created_at: 'SUBMITTED#2024-01-01T00:00:00Z', + created_at: '2024-01-01T00:00:00Z', + updated_at: '2024-01-01T00:00:00Z', + task_type: 'pr_iteration', + pr_number: 10, + }; + + function mockPrFetch(): void { + mockSmSend.mockResolvedValueOnce({ SecretString: 'ghp_test' }); + mockFetch + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ + number: 10, title: 'Test PR', body: 'body', head: { ref: 'feat' }, base: { ref: 'main' }, state: 'open', + }), + }) + .mockResolvedValueOnce(makeGraphQLThreadsResponse([])) + .mockResolvedValueOnce({ ok: true, json: async () => ([]) }) + .mockResolvedValueOnce({ ok: true, json: async () => ([]) }); + } + + test('returns guardrail_blocked when PR context is blocked', async () => { + mockPrFetch(); + mockBedrockSend.mockResolvedValueOnce({ action: 'GUARDRAIL_INTERVENED' }); + + const result = await hydrateContext(basePrTask as any); + expect(result.guardrail_blocked).toBe('PR context blocked by content policy'); + expect(result.user_prompt).toContain('Pull Request #10'); + expect(result.resolved_branch_name).toBe('feat'); + expect(result.resolved_base_branch).toBe('main'); + 
expect(result.sources).toContain('pull_request'); + expect(result.token_estimate).toBeGreaterThan(0); + expect(result.version).toBe(1); + }); + + test('proceeds normally when PR context passes guardrail', async () => { + mockPrFetch(); + mockBedrockSend.mockResolvedValueOnce({ action: 'NONE' }); + + const result = await hydrateContext(basePrTask as any); + expect(result.guardrail_blocked).toBeUndefined(); + expect(result.user_prompt).toContain('Pull Request #10'); + }); + + test('throws when guardrail screening fails (fail-closed)', async () => { + mockPrFetch(); + mockBedrockSend.mockRejectedValueOnce(new Error('Bedrock timeout')); + + await expect(hydrateContext(basePrTask as any)).rejects.toThrow('Guardrail screening unavailable: Bedrock timeout'); + }); + + test('returns guardrail_blocked for pr_review task type', async () => { + mockSmSend.mockResolvedValueOnce({ SecretString: 'ghp_test' }); + mockFetch + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ + number: 20, title: 'Review PR', body: 'body', head: { ref: 'review-branch' }, base: { ref: 'main' }, state: 'open', + }), + }) + .mockResolvedValueOnce(makeGraphQLThreadsResponse([])) + .mockResolvedValueOnce({ ok: true, json: async () => ([]) }) + .mockResolvedValueOnce({ ok: true, json: async () => ([]) }); + mockBedrockSend.mockResolvedValueOnce({ action: 'GUARDRAIL_INTERVENED' }); + + const prReviewTask = { + ...basePrTask, + task_type: 'pr_review', + pr_number: 20, + }; + const result = await hydrateContext(prReviewTask as any); + expect(result.guardrail_blocked).toBe('PR context blocked by content policy'); + expect(mockBedrockSend).toHaveBeenCalledTimes(1); + }); + + test('does not invoke guardrail for new_task type', async () => { + mockSmSend.mockResolvedValueOnce({ SecretString: 'ghp_test' }); + mockFetch.mockResolvedValueOnce({ + ok: true, + json: async () => ({ number: 42, title: 'Bug', body: 'Details', comments: 0 }), + }); + + const newTask = { + task_id: 'TASK-NEW-001', + user_id: 
'user-123', + status: 'SUBMITTED', + repo: 'org/repo', + branch_name: 'bgagent/TASK-NEW-001/fix', + channel_source: 'api', + status_created_at: 'SUBMITTED#2024-01-01T00:00:00Z', + created_at: '2024-01-01T00:00:00Z', + updated_at: '2024-01-01T00:00:00Z', + task_type: 'new_task', + issue_number: 42, + task_description: 'Fix it', + }; + const result = await hydrateContext(newTask as any); + expect(result.guardrail_blocked).toBeUndefined(); + expect(mockBedrockSend).not.toHaveBeenCalled(); + }); +}); diff --git a/cdk/test/handlers/shared/create-task-core.test.ts b/cdk/test/handlers/shared/create-task-core.test.ts index 9f81bdb..83a7f58 100644 --- a/cdk/test/handlers/shared/create-task-core.test.ts +++ b/cdk/test/handlers/shared/create-task-core.test.ts @@ -114,14 +114,15 @@ describe('createTaskCore', () => { expect(JSON.parse(result.body).error.message).toContain('content policy'); }); - test('creates task when guardrail service fails (fail-open)', async () => { + test('returns 503 when guardrail service fails (fail-closed)', async () => { mockBedrockSend.mockRejectedValueOnce(new Error('Bedrock service unavailable')); const result = await createTaskCore( { repo: 'org/repo', task_description: 'Fix it' }, makeContext(), 'req-1', ); - expect(result.statusCode).toBe(201); + expect(result.statusCode).toBe(503); + expect(JSON.parse(result.body).error.message).toContain('Content screening is temporarily unavailable'); }); test('returns 409 for duplicate idempotency key', async () => { diff --git a/docs/design/API_CONTRACT.md b/docs/design/API_CONTRACT.md index 2946981..3bde6b6 100644 --- a/docs/design/API_CONTRACT.md +++ b/docs/design/API_CONTRACT.md @@ -189,8 +189,10 @@ For `pr_iteration` and `pr_review` tasks, `branch_name` is initially set to `pen | `400` | `VALIDATION_ERROR` | Missing required fields, invalid repo format, no task description or issue or PR number, invalid `task_type`, `pr_number` provided without `task_type: 'pr_iteration'` or `'pr_review'`, `pr_number` 
missing when `task_type` is `pr_iteration` or `pr_review`, invalid `max_turns` (not an integer or outside 1–500 range), invalid `max_budget_usd` (not a number or outside 0.01–100 range). | | `401` | `UNAUTHORIZED` | Missing or invalid auth token. | | `409` | `DUPLICATE_TASK` | Idempotency key matches an existing task (returns the existing task in `data`). | +| `400` | `GUARDRAIL_BLOCKED` | Task description blocked by content screening (prompt injection detected). | | `422` | `REPO_NOT_ONBOARDED` | Repository is not registered with the platform. Repos are onboarded via CDK deployment (`Blueprint` construct), not via a runtime API. See [REPO_ONBOARDING.md](./REPO_ONBOARDING.md). | | `429` | `RATE_LIMIT_EXCEEDED` | User exceeded the per-user rate limit. | +| `503` | `SERVICE_UNAVAILABLE` | Content screening service temporarily unavailable. Retry with backoff. | --- @@ -579,8 +581,10 @@ HMAC verification is performed by the handler (not the authorizer) because API G | Status | Code | Condition | |---|---|---| | `400` | `VALIDATION_ERROR` | Missing required fields, invalid repo format, no task description or issue or PR number, invalid `task_type`, invalid `pr_number`, invalid `max_turns`, invalid `max_budget_usd`. | +| `400` | `GUARDRAIL_BLOCKED` | Task description blocked by content screening. | | `401` | `UNAUTHORIZED` | Missing webhook headers, webhook not found, revoked, or invalid signature. | | `409` | `DUPLICATE_TASK` | Idempotency key matches an existing task. | +| `503` | `SERVICE_UNAVAILABLE` | Content screening service temporarily unavailable. | **Channel metadata:** Tasks created via webhook record `channel_source: 'webhook'` and `channel_metadata` including `webhook_id`, `source_ip`, `user_agent`, and `api_request_id` for audit purposes. @@ -615,9 +619,10 @@ Rate limit status is communicated via response headers (see Standard response he | `REPO_NOT_ONBOARDED` | 422 | Repository is not registered with the platform. 
Repos are onboarded via CDK deployment, not via a runtime API. There are no `/v1/repos` endpoints. | | `PR_NOT_FOUND_OR_CLOSED` | 422 | For `pr_iteration` and `pr_review` tasks: the specified PR does not exist, is not open, or is not accessible with the configured GitHub token. Checked during the orchestrator's pre-flight step. | | `INVALID_STEP_SEQUENCE` | 500 | The blueprint's step sequence is invalid (missing required steps or incorrect ordering). This indicates a CDK configuration error that slipped past synth-time validation. Visible via `GET /v1/tasks/{id}` as `error_code`. See [REPO_ONBOARDING.md](./REPO_ONBOARDING.md#step-sequence-validation). | +| `GUARDRAIL_BLOCKED` | 400 | Task description was blocked by Bedrock Guardrail content screening (prompt injection detected). Revise the task description and retry. | | `RATE_LIMIT_EXCEEDED` | 429 | User exceeded rate limit. | | `INTERNAL_ERROR` | 500 | Unexpected server error. Includes `request_id` for support. | -| `SERVICE_UNAVAILABLE` | 503 | Downstream dependency unavailable (e.g. DynamoDB, AgentCore). Retry with backoff. | +| `SERVICE_UNAVAILABLE` | 503 | Downstream dependency unavailable (e.g. DynamoDB, AgentCore, Bedrock Guardrails). Retry with backoff. | --- diff --git a/docs/design/ARCHITECTURE.md b/docs/design/ARCHITECTURE.md index fabba88..19bb916 100644 --- a/docs/design/ARCHITECTURE.md +++ b/docs/design/ARCHITECTURE.md @@ -85,7 +85,7 @@ For the full orchestrator design — task state machine, execution model, failur The steps below are the blueprint in action: deterministic orchestration (1–2, 4) and one agentic step (3). -1. **Deterministic:** The task orchestrator runs admission control, then context hydration (task id, issue body, user message, memory context → assembled prompt). When AgentCore Memory is configured, context hydration loads repository knowledge (semantic search) and past task episodes (episodic search) in parallel and injects them into the system prompt. See [MEMORY.md](./MEMORY.md). 
+1. **Deterministic:** The task orchestrator runs admission control, then context hydration (task id, issue body, user message, memory context → assembled prompt). When AgentCore Memory is configured, context hydration loads repository knowledge (semantic search) and past task episodes (episodic search) in parallel and injects them into the system prompt. For PR tasks, the assembled prompt is screened through Bedrock Guardrails for prompt injection before proceeding to session start. See [MEMORY.md](./MEMORY.md) for memory retrieval and [SECURITY.md](./SECURITY.md) for guardrail screening. 2. **Deterministic:** The orchestrator starts the agent session (compute environment) and passes in the prompt. The prompt version (SHA-256 hash of deterministic prompt parts) is stored on the task record for traceability. 3. **Agentic:** The agent runs in the isolated environment: clone repo, create branch, edit code, commit often, run tests and lint, create PR. Commits are attributed via git trailers (`Task-Id`, `Prompt-Version`). At task end, the agent writes memory (task episode + repo learnings) to AgentCore Memory. The orchestrator does not execute this logic; it only waits for the session to finish. 4. **Deterministic:** The orchestrator infers the result (e.g. by querying GitHub for a PR on the agent's branch), updates task status, and finalizes (result inference, cleanup). If the agent did not write memory (crash, timeout), the orchestrator writes a fallback episode. A validation step may run here (e.g. configurable post-agent checks); see repo onboarding for customizing these steps.
@@ -204,6 +204,7 @@ Each concept has a **source-of-truth document** and one or more documents that r | Live session replay | ROADMAP.md (Iter 4) | API_CONTRACT.md | | PR iteration task type | API_CONTRACT.md, ORCHESTRATOR.md | USER_GUIDE.md, PROMPT_GUIDE.md, SECURITY.md, AGENT_HARNESS.md | | PR review task type | API_CONTRACT.md, ORCHESTRATOR.md | USER_GUIDE.md, PROMPT_GUIDE.md, SECURITY.md, AGENT_HARNESS.md | +| Bedrock Guardrail input screening | SECURITY.md (Input validation and guardrails) | ORCHESTRATOR.md (Context hydration), API_CONTRACT.md (Error codes), OBSERVABILITY.md (Alarms), ROADMAP.md (3c) | ### Per-repo model selection diff --git a/docs/design/INPUT_GATEWAY.md b/docs/design/INPUT_GATEWAY.md index b279ffc..10d71fc 100644 --- a/docs/design/INPUT_GATEWAY.md +++ b/docs/design/INPUT_GATEWAY.md @@ -38,7 +38,7 @@ In short: **every input channel connects through this central point; the gateway Every channel-specific payload must be transformed into the same internal message structure. The rest of the system only ever sees this normalized form. - **Validation** - The gateway must validate normalized messages (required fields, types, allowed actions, target repo/issue refs, size limits) and reject malformed or invalid requests with clear errors. + The gateway must validate normalized messages (required fields, types, allowed actions, target repo/issue refs, size limits) and reject malformed or invalid requests with clear errors. Task descriptions are additionally screened by Amazon Bedrock Guardrails for prompt injection at submission time (fail-closed). See [SECURITY.md](./SECURITY.md). - **Access control** The gateway enforces who can do what (e.g. only the task owner can cancel; only authenticated users can create tasks). This may be defined per channel or globally. 
diff --git a/docs/design/OBSERVABILITY.md b/docs/design/OBSERVABILITY.md index 480fec5..1ab1ab0 100644 --- a/docs/design/OBSERVABILITY.md +++ b/docs/design/OBSERVABILITY.md @@ -126,6 +126,7 @@ Both are one-time, account-level setup steps — not managed by CDK. - Task creation, status transitions (SUBMITTED → HYDRATING → RUNNING → COMPLETED / FAILED / CANCELLED / TIMED_OUT), and terminal state. - **Step-level events** — The blueprint framework emits events for each pipeline step: `{step_name}_started`, `{step_name}_completed`, `{step_name}_failed`. For built-in steps these overlap with the fixed event types (e.g. `hydration_started`). For custom Lambda steps, the step name is user-defined (e.g. `sast-scan_started`, `prepare-environment_completed`). See [REPO_ONBOARDING.md](./REPO_ONBOARDING.md#blueprint-execution-framework) and [API_CONTRACT.md](./API_CONTRACT.md). +- **Guardrail screening events** — `guardrail_blocked` (content blocked by Bedrock Guardrail during hydration, with metadata: `reason`, `task_type`, `pr_number`, `sources`, `token_estimate`). Screening failures are logged with structured `metric_type` fields (not emitted as task events). - Time in each state (e.g. time in HYDRATING, time RUNNING, cold start to first agent activity). - Correlation with a task id and user id so users and operators can filter by task or user. @@ -161,6 +162,8 @@ Plans call for defining at least: - Active tasks (RUNNING count). - Pending tasks (SUBMITTED count). - Task completion rate (success vs failed/cancelled/timed out). +- Guardrail screening failure rate (`metric_type: 'guardrail_screening_failure'` in structured logs — query it with CloudWatch Logs Insights, or publish it as a metric with a CloudWatch Logs metric filter). +- Guardrail blocked rate (`guardrail_blocked` task events). These can be emitted as custom CloudWatch metrics (or equivalent) and used in dashboards and alarms.
@@ -182,6 +185,7 @@ Critical alarms called out in the plans include: - **Orchestration / execution failures** — durable function execution failures (e.g. repeated session start failures). - **Agent crash rate** — spike or sustained high rate of agent/session failures. - **Pending depth** — SUBMITTED tasks exceeding a threshold (signals that the system is at capacity, e.g. AgentCore concurrent session quota bottleneck); may warrant a quota increase or capacity planning. +- **Guardrail screening failures** — sustained Bedrock Guardrail API failures blocking task submissions and PR task hydration (fail-closed). Filter: `metric_type = "guardrail_screening_failure"`. Indicates a Bedrock outage affecting task throughput. ## Code attribution and capture for agent search @@ -237,6 +241,7 @@ When an alarm fires, the operator should follow the corresponding procedure. The | **Orchestration failures** | 1. Check Lambda Durable Functions execution logs. 2. Identify the failing step (load-blueprint, admission-control, start-session, etc.). 3. For `INVALID_STEP_SEQUENCE`: fix the Blueprint CDK construct config and redeploy. 4. For transient failures (DynamoDB throttle, AgentCore timeout): verify service health; the durable execution should auto-retry. | | **Agent crash rate spike** | 1. Check for common root causes: model API errors (Bedrock throttling), compute quota exceeded (AgentCore session limit), image pull failures. 2. Query recent failed tasks by `error_code` for patterns. 3. If quota-related: request a quota increase or reduce concurrency limits. | | **Submitted backlog over threshold** | 1. Check system concurrency: are all slots occupied by running tasks? 2. If yes: the system is at capacity. Options: increase per-user or system-wide concurrency limits (if quota allows), or wait for running tasks to complete. 3. If no: check for orchestrator backlog (tasks in SUBMITTED state not being picked up). | +| **Guardrail screening failures** | 1. 
Check Bedrock service health in the AWS console. 2. Query CloudWatch Logs Insights: `filter metric_type = "guardrail_screening_failure" \| stats count() by bin(5m)`. 3. If Bedrock is down, tasks will fail at submission (503) and during hydration (FAILED). No action needed — tasks will succeed once Bedrock recovers. 4. If failures are unexpected, check guardrail configuration (`GUARDRAIL_ID`, `GUARDRAIL_VERSION` env vars on the orchestrator Lambda). | ## Deployment safety for long-running sessions diff --git a/docs/design/ORCHESTRATOR.md b/docs/design/ORCHESTRATOR.md index 8960e9d..cdfa7d3 100644 --- a/docs/design/ORCHESTRATOR.md +++ b/docs/design/ORCHESTRATOR.md @@ -134,7 +134,7 @@ The **target state** (Iteration 2 and beyond) introduces a durable orchestrator | `SUBMITTED` | `FAILED` | Admission rejected | Repo not onboarded, rate limit, validation failure | | `SUBMITTED` | `CANCELLED` | User cancels | Cancel request received | | `HYDRATING` | `RUNNING` | Hydration complete, session invoked | `invoke_agent_runtime` returns session ID | -| `HYDRATING` | `FAILED` | Hydration error | GitHub API failure, memory failure, prompt assembly error | +| `HYDRATING` | `FAILED` | Hydration error | GitHub API failure, memory failure, prompt assembly error, guardrail content blocked, guardrail service unavailable | | `HYDRATING` | `CANCELLED` | User cancels during hydration | Cancel request received | | `RUNNING` | `FINALIZING` | Session ends (response received or session status = terminated) | — | | `RUNNING` | `CANCELLED` | User cancels | `stop_runtime_session` called, then transition | @@ -179,7 +179,7 @@ See the Admission control section for details. Validates that the task is allowe #### Step 2: Context hydration (deterministic) -See the Context hydration section for details. Assembles the agent's prompt from multiple sources depending on task type. For `new_task`: user message, GitHub issue (title, body, comments), memory, repo configuration, and platform defaults.
For `pr_iteration`: PR metadata, review comments, diff summary, and optional user instructions. An additional **pre-flight** sub-step verifies PR accessibility when `pr_number` is set (see [preflight.ts](../../cdk/src/handlers/shared/preflight.ts)). The output is a fully assembled prompt, ready to pass to the compute session. +See the Context hydration section for details. Assembles the agent's prompt from multiple sources depending on task type. For `new_task`: user message, GitHub issue (title, body, comments), memory, repo configuration, and platform defaults. For `pr_iteration`: PR metadata, review comments, diff summary, and optional user instructions. An additional **pre-flight** sub-step verifies PR accessibility when `pr_number` is set (see [preflight.ts](../../cdk/src/handlers/shared/preflight.ts)). For PR tasks, the assembled prompt is screened through Amazon Bedrock Guardrails for prompt injection before the agent receives it. The output is a fully assembled prompt, ready to pass to the compute session. #### Step 3: Session start and agent execution (deterministic start + agentic execution) @@ -271,11 +271,12 @@ The orchestrator's `hydrateAndTransition()` function calls `hydrateContext()` (` 4. **Assembles the user prompt** based on task type: - **`new_task`**: A structured markdown document with Task ID, Repository, GitHub Issue section, and Task section. The format mirrors the Python `assemble_prompt()` in `agent/entrypoint.py`. - **`pr_iteration`**: Assembled by `assemblePrIterationPrompt()` — includes PR metadata (number, title, body), the diff summary (changed files and patches), review comments (inline and conversation), and optional user instructions from `task_description`. -5. **Returns a `HydratedContext` object** containing `version`, `user_prompt`, `issue`, `sources`, `token_estimate`, `truncated`, and for `pr_iteration`/`pr_review` tasks: `resolved_branch_name` and `resolved_base_branch`. +5. 
**Screens through Bedrock Guardrail** (PR tasks only): For `pr_iteration` and `pr_review` tasks, the assembled user prompt is screened through Amazon Bedrock Guardrails (`screenWithGuardrail()`) using the `PROMPT_ATTACK` content filter. If the guardrail detects prompt injection, `guardrail_blocked` is set on the result and the orchestrator fails the task. If the Bedrock API is unavailable, a `GuardrailScreeningError` is thrown (fail-closed — unscreened content never reaches the agent). Task descriptions for all task types are screened at submission time in `create-task-core.ts`. +6. **Returns a `HydratedContext` object** containing `version`, `user_prompt`, `issue`, `sources`, `token_estimate`, `truncated`, and for `pr_iteration`/`pr_review` tasks: `resolved_branch_name` and `resolved_base_branch`. The hydrated context is passed to the agent as a new `hydrated_context` field in the invocation payload, alongside the existing legacy fields (`repo_url`, `task_id`, `branch_name`, `issue_number`, `prompt`). The agent checks for `hydrated_context` with `version == 1`; if present, it uses the pre-assembled `user_prompt` directly and skips in-container GitHub fetching and prompt assembly. If absent (e.g. during a deployment rollout or when the secret ARN isn't configured), the agent falls back to its existing behavior. -**Graceful degradation:** If any step fails (Secrets Manager unavailable, GitHub API error, network timeout), the orchestrator proceeds with whatever context is available. The worst case is a minimal prompt with just the task ID and repository — the agent can still attempt its own GitHub fetch as a fallback via the legacy `issue_number` field. +**Graceful degradation:** If any step fails (Secrets Manager unavailable, GitHub API error, network timeout), the orchestrator proceeds with whatever context is available. 
The worst case is a minimal prompt with just the task ID and repository — the agent can still attempt its own GitHub fetch as a fallback via the legacy `issue_number` field. **Exception:** `GuardrailScreeningError` is NOT caught by the fallback — it propagates to fail the task. This is intentional: unscreened content must never reach the agent (fail-closed). **PR iteration branch resolution:** After hydration, if `resolved_branch_name` is present on the hydrated context, the orchestrator updates the task record's `branch_name` in DynamoDB from the placeholder (`pending:pr_resolution`) to the PR's actual `head_ref`. This ensures the task record always reflects the real branch name that the agent will push to. @@ -285,6 +286,7 @@ The orchestrator emits two task events during hydration: - `hydration_started` — emitted when the task transitions to `HYDRATING` - `hydration_complete` — emitted after context assembly, with metadata: `sources` (array of context sources used, e.g. `["issue", "task_description"]`), `token_estimate` (estimated token count of the assembled prompt), `truncated` (whether the token budget was exceeded) +- `guardrail_blocked` — emitted when Bedrock Guardrail blocks content during hydration, with metadata: `reason`, `task_type`, `pr_number`, `sources`, `token_estimate` ### AgentCore Gateway — evaluated and deferred @@ -520,6 +522,8 @@ This section uses an FMEA (Failure Mode and Effects Analysis) approach: for each | GitHub API unavailable or rate limited | Cannot fetch issue context | Retry with backoff. If the issue is essential (issue-based task), fail the task. If the user also provided a task description, proceed with degraded context (no issue body). | | Memory service unavailable | Cannot retrieve past insights | Proceed without memory context (memory is an enrichment, not required for MVP). Log warning. 
| | Prompt exceeds token budget | Agent may lose coherence or fail to start | Truncate lower-priority sources (old comments, memory) to fit budget. | +| Bedrock Guardrail blocks content | Prompt injection or adversarial content detected | Task transitions to FAILED. No retry — content is adversarial. The `guardrail_blocked` event is emitted with metadata. | +| Bedrock Guardrail API unavailable | Cannot screen content (fail-closed) | Task transitions to FAILED. Operator should check Bedrock service health. Tasks will succeed once Bedrock recovers. | ### Session start failures diff --git a/docs/design/SECURITY.md b/docs/design/SECURITY.md index adef99e..260f596 100644 --- a/docs/design/SECURITY.md +++ b/docs/design/SECURITY.md @@ -50,7 +50,11 @@ The agent runs with **full permissions inside the sandbox** but cannot escape it - **Per-repo tool profiles:** Stored in the onboarding config and loaded by the orchestrator during context hydration. The agent harness configures the tool set based on the profile. See [REPO_ONBOARDING.md](./REPO_ONBOARDING.md) for per-repo configuration. - **Enforcement mechanism:** Tools are exposed to the agent through **AgentCore Gateway**, which provides built-in mechanisms to enforce access control. The Gateway acts as a managed proxy between the agent and external tools/APIs — only tools registered and authorized in the Gateway are reachable. Per-repo tool profiles map to Gateway tool configurations: the orchestrator registers the allowed tool set for each session, and the Gateway enforces it. This is a platform-level enforcement boundary (not a prompt-level suggestion), meaning the agent cannot bypass it by requesting tools that are not registered. For tools not mediated by the Gateway (e.g. direct bash commands), enforcement relies on the sandbox environment (filesystem permissions, network egress rules, and the bash allowlist configured in the agent harness). 
- **Rationale:** More tools increase the agent's search space, making behavior less predictable and harder to evaluate. A minimal default with opt-in expansion balances capability with reliability. -- **Guardrails** — Amazon Bedrock Guardrails are deployed for task input screening. The `task-input-guardrail` applies a `PROMPT_ATTACK` content filter at `HIGH` strength on task descriptions at submission time. This provides a first layer of defense against prompt injection in user-supplied task descriptions. +- **Guardrails** — Amazon Bedrock Guardrails are deployed for task input screening. The `task-input-guardrail` applies a `PROMPT_ATTACK` content filter at `HIGH` strength on task descriptions at submission time. This provides a first layer of defense against prompt injection in user-supplied task descriptions. A second screening point runs during context hydration for PR tasks (`pr_iteration`, `pr_review`), screening the assembled prompt (PR body, review comments, conversation comments, diff summary, task description) before the agent receives it. Both screening points follow a **fail-closed** pattern: if the Bedrock Guardrail API is unavailable, the task is rejected (submission-time returns HTTP 503; hydration-time transitions the task to FAILED). This ensures unscreened content never reaches the agent, even during Bedrock outages. Screening failures are logged with a structured `metric_type: 'guardrail_screening_failure'` field for CloudWatch alerting. Example CloudWatch Logs Insights query: + ``` + filter metric_type = "guardrail_screening_failure" | stats count() by bin(5m) + ``` + Operators should create a CloudWatch Logs metric filter and a CloudWatch alarm on this field to detect sustained Bedrock outages affecting task throughput. - **Task description length limit** — Task descriptions are capped at 2,000 characters to bound the attack surface for prompt injection and reduce the risk of resource exhaustion from oversized payloads.
## Blueprint custom steps trust boundary @@ -195,11 +199,11 @@ AgentCore Memory has **no native backup mechanism**. This is a significant gap f ## Known limitations - **Single GitHub OAuth token** — one token may be shared for all users and repos the platform can access. Any authenticated user can trigger agent work against any repo that token can access. There is no per-user repo scoping. -- **Guardrails are input-only** — the `PROMPT_ATTACK` filter screens task descriptions at submission. No guardrails are applied to model output during agent execution or to review feedback entering the memory system. For `pr_iteration` and `pr_review` tasks, the PR context (review comments, diff, PR body, issue comments) fetched during context hydration is **not** screened by Bedrock Guardrails — it bypasses the submission-time filter entirely. A `pr_iteration` or `pr_review` task submitted without a `task_description` receives no guardrail screening at all. +- **Guardrails are input-only** — the `PROMPT_ATTACK` filter screens task descriptions at submission and assembled PR prompts during context hydration. No guardrails are applied to model output during agent execution or to review feedback entering the memory system. For `pr_iteration` and `pr_review` tasks, the assembled user prompt (including PR body, review comments, conversation comments, diff summary, and task description) is screened through the Bedrock Guardrail during hydration; if blocked, the task fails with a descriptive error. Guardrail screening follows a fail-closed pattern: a Bedrock outage blocks task submissions (HTTP 503) and fails PR tasks during hydration. - **No memory content validation** — retrieved memory records are injected into the agent's context without sanitization, injection pattern scanning, or trust scoring. This is the most critical memory security gap (OWASP ASI06). 
See [MEMORY.md](./MEMORY.md#memory-security-analysis) for the full gap analysis and [ROADMAP.md Iteration 3e](../guides/ROADMAP.md) for the remediation plan. - **No memory provenance or integrity checking** — memory entries carry no source attribution, content hashing, or trust metadata. The system cannot distinguish agent-generated memory from externally-influenced content. -- **GitHub issue content as untrusted input** — issue bodies and comments (attacker-controlled) are injected into the agent's context during hydration without trust differentiation. -- **PR review comments as untrusted input** — for `pr_iteration` and `pr_review` tasks, review comments, PR body, and conversation comments are fetched and injected into the agent's context. These are attacker-controlled inputs subject to the same prompt injection risks as issue comments. Unlike task descriptions, PR context is not screened by the Bedrock Guardrails `PROMPT_ATTACK` filter. For `pr_review` tasks, defense-in-depth mitigates the risk: the agent runs without `Write` or `Edit` tools, so even if prompt injection succeeds, the agent cannot modify files or push code. +- **GitHub issue content as untrusted input** — issue bodies and comments (attacker-controlled) are injected into the agent's context during hydration for `new_task` tasks without guardrail screening. Only the user-supplied `task_description` is screened at submission time; the fetched issue content bypasses both screening points. This is a known gap — extending hydration-time guardrail screening to `new_task` issue content is planned as a follow-up. +- **PR review comments as untrusted input** — for `pr_iteration` and `pr_review` tasks, review comments, PR body, and conversation comments are fetched and injected into the agent's context. These are attacker-controlled inputs subject to the same prompt injection risks as issue comments. 
The assembled PR prompt is now screened by the Bedrock Guardrails `PROMPT_ATTACK` filter during context hydration; if prompt injection is detected, the task fails before reaching the agent. For `pr_review` tasks, additional defense-in-depth mitigates residual risk: the agent runs without `Write` or `Edit` tools, so even if injection bypasses the guardrail, the agent cannot modify files or push code. - **No memory rollback or quarantine** — the 365-day AgentCore Memory expiration is the only cleanup mechanism. There is no snapshot, rollback, or quarantine capability for suspected poisoned entries. - **No MFA** — Cognito MFA is disabled (CLI-based auth flow). Should be enabled for production deployments. - **No customer-managed KMS** — all encryption at rest uses AWS-managed keys. Customer-managed KMS can be added if required by compliance policy. diff --git a/docs/guides/DEVELOPER_GUIDE.md b/docs/guides/DEVELOPER_GUIDE.md index c4f32a9..79364a4 100644 --- a/docs/guides/DEVELOPER_GUIDE.md +++ b/docs/guides/DEVELOPER_GUIDE.md @@ -428,7 +428,7 @@ cdk/src/ │ ├── webhook-create-task.ts # POST /webhooks/tasks Lambda (HMAC-SHA256 verification) │ └── shared/ │ ├── create-task-core.ts # Shared task creation logic (Cognito + webhook) -│ ├── context-hydration.ts # GitHub issue fetching, prompt assembly, token budget +│ ├── context-hydration.ts # GitHub issue fetching, prompt assembly, token budget, guardrail screening │ ├── gateway.ts # User extraction, webhook context, branch naming │ ├── logger.ts # Structured logger │ ├── orchestrator.ts # Orchestrator step helpers (DDB, AgentCore, concurrency) diff --git a/docs/guides/ROADMAP.md b/docs/guides/ROADMAP.md index 5a00c00..b1f11eb 100644 --- a/docs/guides/ROADMAP.md +++ b/docs/guides/ROADMAP.md @@ -161,6 +161,7 @@ These practices apply continuously across iterations and are not treated as one- - **Tier 3 — Risk and blast radius analysis** — Analyze the scope and impact of the agent's changes to detect unintended side effects in 
other parts of the codebase. Includes: dependency graph analysis (what modules/functions consume the changed code), change surface area (number of files, lines, and modules touched), semantic impact assessment (does the change alter public APIs, shared types, configuration, or database schemas), and regression risk scoring. Produces a **risk level** (low / medium / high / critical) attached to the PR as a label and included in the validation report. High-risk changes may require explicit human approval before merge (foundation for the HITL approval mode in Iteration 6). The risk level considers: number of downstream dependents affected, whether the change touches shared infrastructure or core abstractions, test coverage of the affected area, and whether the change introduces new external dependencies. - **PR risk level and validation report** — Every agent-created PR includes a structured **validation report** (as a PR comment or check run) summarizing: Tier 1 results (pass/fail per tool), Tier 2 findings (code quality issues by severity), Tier 3 risk assessment (risk level, blast radius summary, affected modules). The PR is labeled with the computed risk level (`risk:low`, `risk:medium`, `risk:high`, `risk:critical`). Risk level is persisted in the task record for evaluation and trending. See [EVALUATION.md](../design/EVALUATION.md#pr-risk-level). - [x] **Other task types: PR review and PR-iteration** — Support additional task types beyond "implement from issue": **iterate on pull request** (`pr_iteration`) reads review comments and addresses them (implement changes, push updates, post summary). **Review pull request** (`pr_review`) is a read-only task type where the agent analyzes a PR's changes and posts structured review comments via the GitHub Reviews API. The `pr_review` agent runs without `Write` or `Edit` tools (defense-in-depth), skips `ensure_committed` and push, and treats build status as informational only. 
Each review comment uses a structured format: type (comment/question/issue/good_point), severity for issues (minor/medium/major/critical), title, description with memory attribution, proposed fix, and a ready-to-use AI prompt. The CLI exposes `--review-pr ` (mutually exclusive with `--pr`). +- [x] **Input guardrail screening (Bedrock Guardrails)** — Amazon Bedrock Guardrails screen task descriptions at submission time and assembled PR prompts during context hydration (`pr_iteration`, `pr_review`). Uses `PROMPT_ATTACK` content filter at `HIGH` strength. Fail-closed: Bedrock outages block tasks rather than letting unscreened content through. See [SECURITY.md](../design/SECURITY.md). - **Multi-modal input** — Accept text and images (or other modalities) in the task payload; pass through to the agent. Gateway and schema support it; agent harness supports it where available. Primary use case: screenshots of bugs, UI mockups, or design specs attached to issues. **Builds on Iteration 3b:** Memory is operational; this iteration changes the orchestrator blueprint (tiered validation pipeline, new task type) and broadens the input schema. These are independently testable from memory. @@ -255,7 +256,7 @@ Deep research identified **9 memory-layer security gaps** in the current archite - **Adaptive model router with cost-aware cascade** — Per-turn model selection via a lightweight heuristic engine. File reads and simple edits use a cheaper model (Haiku); multi-file refactors use Sonnet; complex reasoning escalates to Opus. Error escalation: if the agent fails twice on the same step, upgrade model for the retry. As the cost budget ceiling approaches, cascade down to cheaper models. Blueprint `modelCascade` config enables per-repo tuning. Potential 30-40% cost reduction on inference-dominated workloads. Requires agent harness changes to support mid-session model switching. 
- **Advanced evaluation and feedback loop** — Extend the basic evaluation pipeline from Iteration 3d: ML-based or LLM-based trace analysis (not just rules), A/B prompt comparison framework, automated feedback into prompt templates (e.g. "for repo X, always run tests before opening PR"), and per-repo or per-failure-type improvement tracking. Evaluation results can update the repo's agent configuration stored during onboarding. **Optional patterns from adaptive teaching research** (e.g. plan → targeted critique → execution; separate **evaluator** vs **prompt/reflection** roles; fitness from LLM judging plus efficiency metrics; evolution of teaching templates from failed trajectories with Pareto-style candidate sets for diverse failure modes) can inform offline or scheduled improvement of Blueprint prompts and checklists without replacing ABCA's core orchestrator. - **Formal orchestrator verification (TLA+)** — Add a formal specification of the orchestrator in TLA+ and verify it with TLC model checking. Scope includes the task state machine (8 states, valid transitions, terminal states), concurrency admission control (atomic increment + max check), cancellation races (cancel arriving during any orchestration step), reconciler/orchestrator interleavings (counter drift correction while tasks are active), and the polling loop (agent writes terminal status, orchestrator observes and finalizes). Define invariants such as valid-state progression, no illegal transitions, and repo-level safety constraints (for example, at most one active `RUNNING` task per repo when configured). Keep the spec aligned with `src/constructs/task-status.ts` and orchestrator docs so regressions surface as model-check counterexamples before production. -- **Guardrails** — Natural-language or policy-based **guardrails** on agent tool calls using Amazon Bedrock Guardrails. Defends against prompt injection, restricts sensitive content generation, and enforces organizational policies (e.g. 
"do not modify files in `/infrastructure`"). See [SECURITY.md](../design/SECURITY.md). Guardrails configuration can be per-repo (via onboarding) or platform-wide. +- **Guardrails (output and tool-call)** — Extend Bedrock Guardrails from input screening (implemented in Iteration 3c) to **output filtering** and **agent tool-call guardrails**. Apply content filters to model responses during agent execution, restrict sensitive content generation, and enforce organizational policies (e.g. "do not modify files in `/infrastructure`"). See [SECURITY.md](../design/SECURITY.md). Guardrails configuration can be per-repo (via onboarding) or platform-wide. - **Capability-based security model** — Fine-grained enforcement beyond Bedrock Guardrails, operating at three levels: (1) **Tool-level capabilities** — Bash command allowlist (git, npm, make permitted; curl, wget blocked), configurable per capability tier (standard / elevated / read-only). (2) **File-system scope** — Blueprint declares include/exclude path patterns; Write/Edit/Read tools are filtered to the declared scope. (3) **Input trust scoring** — Authenticated user input = trusted; external GitHub issues = untrusted; PR review comments entering memory = adversarial. Trust level selects the capability set. Essential once review feedback memory (Iter 3d) introduces attacker-controlled content into the agent's context. Blueprint `security` prop configures the capability profile per repo. - **Additional execution environment** — Support an alternative to AgentCore Runtime (e.g. ECS/Fargate, EKS) behind the **ComputeStrategy** interface (see [REPO_ONBOARDING.md](../design/REPO_ONBOARDING.md#compute-strategy-interface)). The orchestrator calls abstract methods (`startSession`, `stopSession`, `pollSession`); the implementation maps to AgentCore, Fargate, or EKS. Repos select the strategy via `compute_type` in their blueprint configuration. Reduces vendor lock-in and enables workloads that exceed AgentCore limits (e.g. 
GPU, larger images, longer sessions). The ComputeStrategy interface contract is defined in Iteration 3a; Iteration 5 adds alternative implementations. - **Full web dashboard** — Extend the control panel from Iteration 4: detailed dashboards (cost, performance, evaluation), reasoning trace viewer or log explorer (linked to OpenTelemetry traces from AgentCore), task submit/cancel from the UI, and admin views (system health, capacity, user management). @@ -288,12 +289,12 @@ Deep research identified **9 memory-layer security gaps** in the current archite - **Iteration 2** — Production orchestrator, API contract, task management (list/status/cancel), durable execution, observability, threat model, network isolation, basic cost guardrails, CI/CD. - **Iteration 3a** — Repo onboarding, DNS Firewall (domain-level egress filtering), webhook trigger, GitHub Actions, per-repo customization (prompt from repo), data retention, turn/iteration caps, cost budget caps, user prompt guide, agent harness improvements (turn budget, default branch, safety net, lint, softened conventions), operator dashboard, WAF, model invocation logging, input length limits. - **Iteration 3b** ✅ — Memory Tier 1 (repo knowledge, task episodes), insights, agent self-feedback, prompt versioning, per-prompt commit attribution. CDK L2 construct with named semantic + episodic strategies using namespace templates (`/{actorId}/knowledge/`, `/{actorId}/episodes/{sessionId}/`), fail-open memory load/write, orchestrator fallback episode, SHA-256 prompt hashing, git trailer attribution. 
-- **Iteration 3c** — Per-repo GitHub App credentials, orchestrator pre-flight checks (fail-closed before session start), persistent session storage for select caches (AgentCore Runtime `/mnt/workspace` mount for npm/Claude config; mise/uv/repo on local disk due to FUSE `flock()` limitation), pre-execution task risk classification (model/limits/approval policy selection), tiered validation pipeline (tool validation, code quality analysis, post-execution risk/blast radius analysis), PR risk level, PR review task type (`pr_review` — read-only structured review with tool restriction, defense-in-depth enforcement, CLI `--review-pr` flag), multi-modal input. +- **Iteration 3c** — Per-repo GitHub App credentials, orchestrator pre-flight checks (fail-closed before session start), persistent session storage for select caches (AgentCore Runtime `/mnt/workspace` mount for npm/Claude config; mise/uv/repo on local disk due to FUSE `flock()` limitation), pre-execution task risk classification (model/limits/approval policy selection), tiered validation pipeline (tool validation, code quality analysis, post-execution risk/blast radius analysis), PR risk level, PR review task type (`pr_review` — read-only structured review with tool restriction, defense-in-depth enforcement, CLI `--review-pr` flag), input guardrail screening (Bedrock Guardrails, fail-closed), multi-modal input. - **Iteration 3d** — Review feedback memory loop (Tier 2), PR outcome tracking, evaluation pipeline (basic). - **Iteration 3e** — Memory security and integrity: input hardening (content sanitization, provenance tagging, integrity hashing), trust-aware retrieval (trust scoring, temporal decay, guardian validation), detection and response (anomaly detection, circuit breaker, quarantine, rollback), advanced protections (write-ahead validation, behavioral drift detection, cryptographic provenance, red teaming). Addresses OWASP ASI06 (Memory & Context Poisoning). 
- **Iteration 3bis** (hardening) — Orchestrator IAM grant for Memory (was silently AccessDenied), memory schema versioning (`schema_version: "2"`), Python repo format validation, severity-aware error logging in Python memory, narrowed entrypoint try-catch, orchestrator fallback episode observability, conditional writes in agent task_state.py (ConditionExpression guards), orchestrator Lambda error alarm (CloudWatch, retryAttempts: 0), concurrency counter reconciliation (scheduled Lambda, drift correction), multi-AZ NAT documentation (already configurable), Python unit tests (pytest), entrypoint decomposition (4 extracted subfunctions), dual prompt assembly deprecation docstring, graceful thread drain in server.py (shutdown hook + atexit), dead QUEUED state removal (8 states, 4 active). - **Iteration 4** — Additional git providers, visual proof (screenshots/videos), Slack channel, skills pipeline, user preference memory (Tier 3), control panel (restrict CORS to dashboard origin), real-time event streaming (WebSocket), live session replay and mid-task nudge, browser extension client, MFA for production. -- **Iteration 5** — Snapshot-on-schedule pre-warming, multi-user/team, memory isolation for multi-tenancy, full cost management, adaptive model router with cost-aware cascade, advanced evaluation (optional adaptive-teaching / trajectory-driven prompt patterns), formal orchestrator verification with TLA+/TLC, full Bedrock Guardrails (PII, denied topics, output filters), capability-based security model, alternate runtime, advanced customization with tiered tool access (MCP/plugins via AgentCore Gateway), full dashboard, AI-specific WAF rules. 
+- **Iteration 5** — Snapshot-on-schedule pre-warming, multi-user/team, memory isolation for multi-tenancy, full cost management, adaptive model router with cost-aware cascade, advanced evaluation (optional adaptive-teaching / trajectory-driven prompt patterns), formal orchestrator verification with TLA+/TLC, Bedrock Guardrails for output and tool calls (PII, denied topics, output filters; input screening shipped in 3c), capability-based security model, alternate runtime, advanced customization with tiered tool access (MCP/plugins via AgentCore Gateway), full dashboard, AI-specific WAF rules. - **Iteration 6** — Agent swarm orchestration, skills learning, multi-repo, iterative feedback and multiplayer sessions, HITL approval, scheduled triggers, CDK constructs. Design docs to keep in sync: [ARCHITECTURE.md](../design/ARCHITECTURE.md), [ORCHESTRATOR.md](../design/ORCHESTRATOR.md), [API_CONTRACT.md](../design/API_CONTRACT.md), [INPUT_GATEWAY.md](../design/INPUT_GATEWAY.md), [REPO_ONBOARDING.md](../design/REPO_ONBOARDING.md), [MEMORY.md](../design/MEMORY.md), [OBSERVABILITY.md](../design/OBSERVABILITY.md), [COMPUTE.md](../design/COMPUTE.md), [CONTROL_PANEL.md](../design/CONTROL_PANEL.md), [SECURITY.md](../design/SECURITY.md), [EVALUATION.md](../design/EVALUATION.md). diff --git a/docs/guides/USER_GUIDE.md b/docs/guides/USER_GUIDE.md index bb96cb7..6fef6c4 100644 --- a/docs/guides/USER_GUIDE.md +++ b/docs/guides/USER_GUIDE.md @@ -203,6 +203,8 @@ curl -X POST "$API_URL/tasks" \ | `max_turns` | number | No | Maximum agent turns (1–500). Overrides the per-repo Blueprint default. Platform default: 100. | | `max_budget_usd` | number | No | Maximum cost budget in USD (0.01–100). When reached, the agent stops regardless of remaining turns. Overrides the per-repo Blueprint default. If omitted, no budget limit is applied. | +**Content screening:** Task descriptions are automatically screened by Amazon Bedrock Guardrails for prompt injection before the task is created.
If content is blocked, you receive a `400 GUARDRAIL_BLOCKED` error — revise the description and retry. If the screening service is temporarily unavailable, you receive a `503` error — retry after a short delay. For PR tasks (`pr_iteration`, `pr_review`), the assembled prompt (including PR body and review comments) is also screened during context hydration; if blocked, the task transitions to `FAILED`. + **Idempotency:** Include an `Idempotency-Key` header (alphanumeric, dashes, underscores, max 128 chars) to prevent duplicate task creation on retries: ```bash @@ -524,7 +526,7 @@ The orchestrator uses Lambda Durable Functions to manage the lifecycle durably | `HYDRATING` | Orchestrator passed admission control; assembling the agent payload | | `RUNNING` | Agent session started and actively working on the task | | `COMPLETED` | Agent finished and created a PR (or determined no changes were needed) | -| `FAILED` | Agent encountered an error, or user concurrency limit was reached | +| `FAILED` | Agent encountered an error, user concurrency limit was reached, or content was blocked by guardrail screening | | `CANCELLED` | Task was cancelled by the user | | `TIMED_OUT` | Task exceeded the maximum allowed duration (~9 hours) | @@ -546,7 +548,7 @@ Each lifecycle transition is recorded as an audit event. Use the events endpoint curl "$API_URL/tasks//events" -H "Authorization: $TOKEN" ``` -Events include: `task_created`, `admission_rejected`, `preflight_failed`, `hydration_started`, `hydration_complete`, `session_started`, `pr_created`, `pr_updated`, `task_completed`, `task_failed`, `task_cancelled`, `task_timed_out`. Event records are subject to the same 90-day retention as task records and are automatically deleted after that period. 
+Events include: `task_created`, `admission_rejected`, `preflight_failed`, `hydration_started`, `hydration_complete`, `guardrail_blocked`, `session_started`, `pr_created`, `pr_updated`, `task_completed`, `task_failed`, `task_cancelled`, `task_timed_out`. Event records are subject to the same 90-day retention as task records and are automatically deleted after that period. ## What the agent does @@ -617,4 +619,5 @@ Filter by task ID to find logs for a specific task. - **Add a CLAUDE.md**: The agent automatically loads project-level configuration from your repository — `CLAUDE.md`, `.claude/CLAUDE.md`, `.claude/rules/*.md`, `.claude/settings.json`, `.claude/agents/`, and `.mcp.json`. Use these to provide project-specific build commands, conventions, constraints, custom subagents, and architecture notes. See the [Prompt guide](./PROMPT_GUIDE.md#repo-level-customization) for details and examples. - **Issue vs text**: When using `--issue` (CLI) or `issue_number` (API), the agent fetches the full issue body from GitHub, including any labels, comments, and linked context. This is usually better than a short text description. - **Cost**: Cost depends on the model and number of turns. Use `--max-turns` (CLI) or `max_turns` (API) to cap the number of agent iterations per task (range: 1–500). If not specified, the per-repo Blueprint default applies, falling back to the platform default (100). Use `--max-budget` (CLI) or `max_budget_usd` (API) to set a hard cost limit in USD ($0.01–$100) — when the budget is reached, the agent stops regardless of remaining turns. If no budget is specified, the per-repo Blueprint default applies; if that is also absent, no cost limit is enforced. Check the task status after completion to see the reported cost. +- **Content screening**: Task descriptions and PR context are screened by Bedrock Guardrails for prompt injection. If your task is unexpectedly blocked, check the task events (`guardrail_blocked`) for details and revise your description. 
- **Idempotency**: Use the `Idempotency-Key` header when creating tasks via the API to safely retry requests without creating duplicate tasks. diff --git a/docs/src/content/docs/design/Api-contract.md b/docs/src/content/docs/design/Api-contract.md index 528823d..beaa6f4 100644 --- a/docs/src/content/docs/design/Api-contract.md +++ b/docs/src/content/docs/design/Api-contract.md @@ -193,8 +193,10 @@ For `pr_iteration` and `pr_review` tasks, `branch_name` is initially set to `pen | `400` | `VALIDATION_ERROR` | Missing required fields, invalid repo format, no task description or issue or PR number, invalid `task_type`, `pr_number` provided without `task_type: 'pr_iteration'` or `'pr_review'`, `pr_number` missing when `task_type` is `pr_iteration` or `pr_review`, invalid `max_turns` (not an integer or outside 1–500 range), invalid `max_budget_usd` (not a number or outside 0.01–100 range). | | `401` | `UNAUTHORIZED` | Missing or invalid auth token. | | `409` | `DUPLICATE_TASK` | Idempotency key matches an existing task (returns the existing task in `data`). | +| `400` | `GUARDRAIL_BLOCKED` | Task description blocked by content screening (prompt injection detected). | | `422` | `REPO_NOT_ONBOARDED` | Repository is not registered with the platform. Repos are onboarded via CDK deployment (`Blueprint` construct), not via a runtime API. See [REPO_ONBOARDING.md](/design/repo-onboarding). | | `429` | `RATE_LIMIT_EXCEEDED` | User exceeded the per-user rate limit. | +| `503` | `SERVICE_UNAVAILABLE` | Content screening service temporarily unavailable. Retry with backoff. | --- @@ -583,8 +585,10 @@ HMAC verification is performed by the handler (not the authorizer) because API G | Status | Code | Condition | |---|---|---| | `400` | `VALIDATION_ERROR` | Missing required fields, invalid repo format, no task description or issue or PR number, invalid `task_type`, invalid `pr_number`, invalid `max_turns`, invalid `max_budget_usd`. 
| +| `400` | `GUARDRAIL_BLOCKED` | Task description blocked by content screening. | | `401` | `UNAUTHORIZED` | Missing webhook headers, webhook not found, revoked, or invalid signature. | | `409` | `DUPLICATE_TASK` | Idempotency key matches an existing task. | +| `503` | `SERVICE_UNAVAILABLE` | Content screening service temporarily unavailable. | **Channel metadata:** Tasks created via webhook record `channel_source: 'webhook'` and `channel_metadata` including `webhook_id`, `source_ip`, `user_agent`, and `api_request_id` for audit purposes. @@ -619,9 +623,10 @@ Rate limit status is communicated via response headers (see Standard response he | `REPO_NOT_ONBOARDED` | 422 | Repository is not registered with the platform. Repos are onboarded via CDK deployment, not via a runtime API. There are no `/v1/repos` endpoints. | | `PR_NOT_FOUND_OR_CLOSED` | 422 | For `pr_iteration` and `pr_review` tasks: the specified PR does not exist, is not open, or is not accessible with the configured GitHub token. Checked during the orchestrator's pre-flight step. | | `INVALID_STEP_SEQUENCE` | 500 | The blueprint's step sequence is invalid (missing required steps or incorrect ordering). This indicates a CDK configuration error that slipped past synth-time validation. Visible via `GET /v1/tasks/{id}` as `error_code`. See [REPO_ONBOARDING.md](/design/repo-onboarding#step-sequence-validation). | +| `GUARDRAIL_BLOCKED` | 400 | Task description was blocked by Bedrock Guardrail content screening (prompt injection detected). Revise the task description and retry. | | `RATE_LIMIT_EXCEEDED` | 429 | User exceeded rate limit. | | `INTERNAL_ERROR` | 500 | Unexpected server error. Includes `request_id` for support. | -| `SERVICE_UNAVAILABLE` | 503 | Downstream dependency unavailable (e.g. DynamoDB, AgentCore). Retry with backoff. | +| `SERVICE_UNAVAILABLE` | 503 | Downstream dependency unavailable (e.g. DynamoDB, AgentCore, Bedrock Guardrails). Retry with backoff. 
| --- diff --git a/docs/src/content/docs/design/Architecture.md b/docs/src/content/docs/design/Architecture.md index 3d3de1a..dd5f330 100644 --- a/docs/src/content/docs/design/Architecture.md +++ b/docs/src/content/docs/design/Architecture.md @@ -89,7 +89,7 @@ For the full orchestrator design — task state machine, execution model, failur The steps below are the blueprint in action: deterministic orchestration (1–2, 4) and one agentic step (3). -1. **Deterministic:** The task orchestrator runs admission control, then context hydration (task id, issue body, user message, memory context → assembled prompt). When AgentCore Memory is configured, context hydration loads repository knowledge (semantic search) and past task episodes (episodic search) in parallel and injects them into the system prompt. See [MEMORY.md](/design/memory). +1. **Deterministic:** The task orchestrator runs admission control, then context hydration (task id, issue body, user message, memory context → assembled prompt). When AgentCore Memory is configured, context hydration loads repository knowledge (semantic search) and past task episodes (episodic search) in parallel and injects them into the system prompt. For PR tasks, the assembled prompt is screened through Bedrock Guardrails for prompt injection before proceeding to session start. See [MEMORY.md](/design/memory). 2. **Deterministic:** The orchestrator starts the agent session (compute environment) and passes in the prompt. The prompt version (SHA-256 hash of deterministic prompt parts) is stored on the task record for traceability. 3. **Agentic:** The agent runs in the isolated environment: clone repo, create branch, edit code, commit often, run tests and lint, create PR. Commits are attributed via git trailers (`Task-Id`, `Prompt-Version`). At task end, the agent writes memory (task episode + repo learnings) to AgentCore Memory. The orchestrator does not execute this logic; it only waits for the session to finish. 4. 
**Deterministic:** The orchestrator infers the result (e.g. by querying GitHub for a PR on the agent's branch), updates task status, and finalizes (result inference, cleanup). If the agent did not write memory (crash, timeout), the orchestrator writes a fallback episode. A validation step may run here (e.g. configurable post-agent checks); see repo onboarding for customizing these steps. @@ -208,6 +208,7 @@ Each concept has a **source-of-truth document** and one or more documents that r | Live session replay | ROADMAP.md (Iter 4) | API_CONTRACT.md | | PR iteration task type | API_CONTRACT.md, ORCHESTRATOR.md | USER_GUIDE.md, PROMPT_GUIDE.md, SECURITY.md, AGENT_HARNESS.md | | PR review task type | API_CONTRACT.md, ORCHESTRATOR.md | USER_GUIDE.md, PROMPT_GUIDE.md, SECURITY.md, AGENT_HARNESS.md | +| Bedrock Guardrail input screening | SECURITY.md (Input validation and guardrails) | ORCHESTRATOR.md (Context hydration), API_CONTRACT.md (Error codes), OBSERVABILITY.md (Alarms), ROADMAP.md (3c) | ### Per-repo model selection diff --git a/docs/src/content/docs/design/Input-gateway.md b/docs/src/content/docs/design/Input-gateway.md index db89e53..c3d66eb 100644 --- a/docs/src/content/docs/design/Input-gateway.md +++ b/docs/src/content/docs/design/Input-gateway.md @@ -42,7 +42,7 @@ In short: **every input channel connects through this central point; the gateway Every channel-specific payload must be transformed into the same internal message structure. The rest of the system only ever sees this normalized form. - **Validation** - The gateway must validate normalized messages (required fields, types, allowed actions, target repo/issue refs, size limits) and reject malformed or invalid requests with clear errors. + The gateway must validate normalized messages (required fields, types, allowed actions, target repo/issue refs, size limits) and reject malformed or invalid requests with clear errors. 
Task descriptions are additionally screened by Amazon Bedrock Guardrails for prompt injection at submission time (fail-closed). See [SECURITY.md](/design/security). - **Access control** The gateway enforces who can do what (e.g. only the task owner can cancel; only authenticated users can create tasks). This may be defined per channel or globally. diff --git a/docs/src/content/docs/design/Observability.md b/docs/src/content/docs/design/Observability.md index 4dcde70..54bb586 100644 --- a/docs/src/content/docs/design/Observability.md +++ b/docs/src/content/docs/design/Observability.md @@ -130,6 +130,7 @@ Both are one-time, account-level setup steps — not managed by CDK. - Task creation, status transitions (SUBMITTED → HYDRATING → RUNNING → COMPLETED / FAILED / CANCELLED / TIMED_OUT), and terminal state. - **Step-level events** — The blueprint framework emits events for each pipeline step: `{step_name}_started`, `{step_name}_completed`, `{step_name}_failed`. For built-in steps these overlap with the fixed event types (e.g. `hydration_started`). For custom Lambda steps, the step name is user-defined (e.g. `sast-scan_started`, `prepare-environment_completed`). See [REPO_ONBOARDING.md](/design/repo-onboarding#blueprint-execution-framework) and [API_CONTRACT.md](/design/api-contract). +- **Guardrail screening events** — `guardrail_blocked` (content blocked by Bedrock Guardrail during hydration, with metadata: `reason`, `task_type`, `pr_number`, `sources`, `token_estimate`). Screening failures are logged with structured `metric_type` fields (not emitted as task events). - Time in each state (e.g. time in HYDRATING, time RUNNING, cold start to first agent activity). - Correlation with a task id and user id so users and operators can filter by task or user. @@ -165,6 +166,8 @@ Plans call for defining at least: - Active tasks (RUNNING count). - Pending tasks (SUBMITTED count). - Task completion rate (success vs failed/cancelled/timed out). 
+- Guardrail screening failure rate (`metric_type: 'guardrail_screening_failure'` in structured logs — use a CloudWatch Logs metric filter or a Logs Insights query). +- Guardrail blocked rate (`guardrail_blocked` task events). These can be emitted as custom CloudWatch metrics (or equivalent) and used in dashboards and alarms. @@ -186,6 +189,7 @@ Critical alarms called out in the plans include: - **Orchestration / execution failures** — durable function execution failures (e.g. repeated session start failures). - **Agent crash rate** — spike or sustained high rate of agent/session failures. - **Pending depth** — SUBMITTED tasks exceeding a threshold (signals that the system is at capacity, e.g. AgentCore concurrent session quota bottleneck); may warrant a quota increase or capacity planning. +- **Guardrail screening failures** — sustained Bedrock Guardrail API failures blocking task submissions and PR task hydration (fail-closed). Filter: `metric_type = "guardrail_screening_failure"`. Indicates a Bedrock outage affecting task throughput. ## Code attribution and capture for agent search @@ -241,6 +245,7 @@ When an alarm fires, the operator should follow the corresponding procedure. The | **Orchestration failures** | 1. Check Lambda Durable Functions execution logs. 2. Identify the failing step (load-blueprint, admission-control, start-session, etc.). 3. For `INVALID_STEP_SEQUENCE`: fix the Blueprint CDK construct config and redeploy. 4. For transient failures (DynamoDB throttle, AgentCore timeout): verify service health; the durable execution should auto-retry. | | **Agent crash rate spike** | 1. Check for common root causes: model API errors (Bedrock throttling), compute quota exceeded (AgentCore session limit), image pull failures. 2. Query recent failed tasks by `error_code` for patterns. 3. If quota-related: request a quota increase or reduce concurrency limits. | | **Submitted backlog over threshold** | 1. Check system concurrency: are all slots occupied by running tasks? 2.
If yes: the system is at capacity. Options: increase per-user or system-wide concurrency limits (if quota allows), or wait for running tasks to complete. 3. If no: check for orchestrator backlog (tasks in SUBMITTED state not being picked up). | +| **Guardrail screening failures** | 1. Check Bedrock service health in the AWS console. 2. Query CloudWatch Logs: `filter metric_type = "guardrail_screening_failure" \| stats count() by bin(5m)`. 3. If Bedrock is down, tasks will fail at submission (503) and during hydration (FAILED). No action needed — tasks will succeed once Bedrock recovers. 4. If failures are unexpected, check guardrail configuration (`GUARDRAIL_ID`, `GUARDRAIL_VERSION` env vars on the orchestrator Lambda). | ## Deployment safety for long-running sessions diff --git a/docs/src/content/docs/design/Orchestrator.md b/docs/src/content/docs/design/Orchestrator.md index cc02bfc..e621bf4 100644 --- a/docs/src/content/docs/design/Orchestrator.md +++ b/docs/src/content/docs/design/Orchestrator.md @@ -138,7 +138,7 @@ The **target state** (Iteration 2 and beyond) introduces a durable orchestrator | `SUBMITTED` | `FAILED` | Admission rejected | Repo not onboarded, rate limit, validation failure | | `SUBMITTED` | `CANCELLED` | User cancels | Cancel request received | | `HYDRATING` | `RUNNING` | Hydration complete, session invoked | `invoke_agent_runtime` returns session ID | -| `HYDRATING` | `FAILED` | Hydration error | GitHub API failure, memory failure, prompt assembly error | +| `HYDRATING` | `FAILED` | Hydration error | GitHub API failure, memory failure, prompt assembly error, guardrail content blocked, guardrail service unavailable | | `HYDRATING` | `CANCELLED` | User cancels during hydration | Cancel request received | | `RUNNING` | `FINALIZING` | Session ends (response received or session status = terminated) | — | | `RUNNING` | `CANCELLED` | User cancels | `stop_runtime_session` called, then transition | @@ -183,7 +183,7 @@ See the Admission control section
for details. Validates that the task is allowe #### Step 2: Context hydration (deterministic) -See the Context hydration section for details. Assembles the agent's prompt from multiple sources depending on task type. For `new_task`: user message, GitHub issue (title, body, comments), memory, repo configuration, and platform defaults. For `pr_iteration`: PR metadata, review comments, diff summary, and optional user instructions. An additional **pre-flight** sub-step verifies PR accessibility when `pr_number` is set (see [preflight.ts](../../cdk/src/handlers/shared/preflight.ts)). The output is a fully assembled prompt, ready to pass to the compute session. +See the Context hydration section for details. Assembles the agent's prompt from multiple sources depending on task type. For `new_task`: user message, GitHub issue (title, body, comments), memory, repo configuration, and platform defaults. For `pr_iteration`: PR metadata, review comments, diff summary, and optional user instructions. An additional **pre-flight** sub-step verifies PR accessibility when `pr_number` is set (see [preflight.ts](../../cdk/src/handlers/shared/preflight.ts)). For PR tasks, the assembled prompt is screened through Amazon Bedrock Guardrails for prompt injection before the agent receives it. The output is a fully assembled prompt, ready to pass to the compute session. #### Step 3: Session start and agent execution (deterministic start + agentic execution) @@ -275,11 +275,12 @@ The orchestrator's `hydrateAndTransition()` function calls `hydrateContext()` (` 4. **Assembles the user prompt** based on task type: - **`new_task`**: A structured markdown document with Task ID, Repository, GitHub Issue section, and Task section. The format mirrors the Python `assemble_prompt()` in `agent/entrypoint.py`. 
- **`pr_iteration`**: Assembled by `assemblePrIterationPrompt()` — includes PR metadata (number, title, body), the diff summary (changed files and patches), review comments (inline and conversation), and optional user instructions from `task_description`. -5. **Returns a `HydratedContext` object** containing `version`, `user_prompt`, `issue`, `sources`, `token_estimate`, `truncated`, and for `pr_iteration`/`pr_review` tasks: `resolved_branch_name` and `resolved_base_branch`. +5. **Screens through Bedrock Guardrail** (PR tasks only): For `pr_iteration` and `pr_review` tasks, the assembled user prompt is screened through Amazon Bedrock Guardrails (`screenWithGuardrail()`) using the `PROMPT_ATTACK` content filter. If the guardrail detects prompt injection, `guardrail_blocked` is set on the result and the orchestrator fails the task. If the Bedrock API is unavailable, a `GuardrailScreeningError` is thrown (fail-closed — unscreened content never reaches the agent). Task descriptions for all task types are screened at submission time in `create-task-core.ts`. +6. **Returns a `HydratedContext` object** containing `version`, `user_prompt`, `issue`, `sources`, `token_estimate`, `truncated`, and for `pr_iteration`/`pr_review` tasks: `resolved_branch_name` and `resolved_base_branch`. The hydrated context is passed to the agent as a new `hydrated_context` field in the invocation payload, alongside the existing legacy fields (`repo_url`, `task_id`, `branch_name`, `issue_number`, `prompt`). The agent checks for `hydrated_context` with `version == 1`; if present, it uses the pre-assembled `user_prompt` directly and skips in-container GitHub fetching and prompt assembly. If absent (e.g. during a deployment rollout or when the secret ARN isn't configured), the agent falls back to its existing behavior. -**Graceful degradation:** If any step fails (Secrets Manager unavailable, GitHub API error, network timeout), the orchestrator proceeds with whatever context is available. 
The worst case is a minimal prompt with just the task ID and repository — the agent can still attempt its own GitHub fetch as a fallback via the legacy `issue_number` field. +**Graceful degradation:** If any step fails (Secrets Manager unavailable, GitHub API error, network timeout), the orchestrator proceeds with whatever context is available. The worst case is a minimal prompt with just the task ID and repository — the agent can still attempt its own GitHub fetch as a fallback via the legacy `issue_number` field. **Exception:** `GuardrailScreeningError` is NOT caught by the fallback — it propagates to fail the task. This is intentional: unscreened content must never reach the agent (fail-closed). **PR iteration branch resolution:** After hydration, if `resolved_branch_name` is present on the hydrated context, the orchestrator updates the task record's `branch_name` in DynamoDB from the placeholder (`pending:pr_resolution`) to the PR's actual `head_ref`. This ensures the task record always reflects the real branch name that the agent will push to. @@ -289,6 +290,7 @@ The orchestrator emits two task events during hydration: - `hydration_started` — emitted when the task transitions to `HYDRATING` - `hydration_complete` — emitted after context assembly, with metadata: `sources` (array of context sources used, e.g. `["issue", "task_description"]`), `token_estimate` (estimated token count of the assembled prompt), `truncated` (whether the token budget was exceeded) +- `guardrail_blocked` — emitted when Bedrock Guardrail blocks content during hydration, with metadata: `reason`, `task_type`, `pr_number`, `sources`, `token_estimate` ### AgentCore Gateway — evaluated and deferred @@ -524,6 +526,8 @@ This section uses an FMEA (Failure Mode and Effects Analysis) approach: for each | GitHub API unavailable or rate limited | Cannot fetch issue context | Retry with backoff. If the issue is essential (issue-based task), fail the task. 
If the user also provided a task description, proceed with degraded context (no issue body). | | Memory service unavailable | Cannot retrieve past insights | Proceed without memory context (memory is an enrichment, not required for MVP). Log warning. | | Prompt exceeds token budget | Agent may lose coherence or fail to start | Truncate lower-priority sources (old comments, memory) to fit budget. | +| Bedrock Guardrail blocks content | Prompt injection or adversarial content detected | Task transitions to FAILED. No retry — content is adversarial. The `guardrail_blocked` event is emitted with metadata. | +| Bedrock Guardrail API unavailable | Cannot screen content (fail-closed) | Task transitions to FAILED. Operator should check Bedrock service health. Resubmit affected tasks once Bedrock recovers. | ### Session start failures diff --git a/docs/src/content/docs/design/Security.md b/docs/src/content/docs/design/Security.md index ef0222a..36b2445 100644 --- a/docs/src/content/docs/design/Security.md +++ b/docs/src/content/docs/design/Security.md @@ -54,7 +54,11 @@ The agent runs with **full permissions inside the sandbox** but cannot escape it - **Per-repo tool profiles:** Stored in the onboarding config and loaded by the orchestrator during context hydration. The agent harness configures the tool set based on the profile. See [REPO_ONBOARDING.md](/design/repo-onboarding) for per-repo configuration. - **Enforcement mechanism:** Tools are exposed to the agent through **AgentCore Gateway**, which provides built-in mechanisms to enforce access control. The Gateway acts as a managed proxy between the agent and external tools/APIs — only tools registered and authorized in the Gateway are reachable. Per-repo tool profiles map to Gateway tool configurations: the orchestrator registers the allowed tool set for each session, and the Gateway enforces it.
This is a platform-level enforcement boundary (not a prompt-level suggestion), meaning the agent cannot bypass it by requesting tools that are not registered. For tools not mediated by the Gateway (e.g. direct bash commands), enforcement relies on the sandbox environment (filesystem permissions, network egress rules, and the bash allowlist configured in the agent harness). - **Rationale:** More tools increase the agent's search space, making behavior less predictable and harder to evaluate. A minimal default with opt-in expansion balances capability with reliability. -- **Guardrails** — Amazon Bedrock Guardrails are deployed for task input screening. The `task-input-guardrail` applies a `PROMPT_ATTACK` content filter at `HIGH` strength on task descriptions at submission time. This provides a first layer of defense against prompt injection in user-supplied task descriptions. +- **Guardrails** — Amazon Bedrock Guardrails are deployed for task input screening. The `task-input-guardrail` applies a `PROMPT_ATTACK` content filter at `HIGH` strength on task descriptions at submission time. This provides a first layer of defense against prompt injection in user-supplied task descriptions. A second screening point runs during context hydration for PR tasks (`pr_iteration`, `pr_review`), screening the assembled prompt (PR body, review comments, conversation comments, diff summary, task description) before the agent receives it. Both screening points follow a **fail-closed** pattern: if the Bedrock Guardrail API is unavailable, the task is rejected (submission-time returns HTTP 503; hydration-time transitions the task to FAILED). This ensures unscreened content never reaches the agent, even during Bedrock outages. 
Screening failures are logged with a structured `metric_type: 'guardrail_screening_failure'` field for CloudWatch alerting; for example, this CloudWatch Logs Insights query counts failures in 5-minute bins: + ``` + filter metric_type = "guardrail_screening_failure" | stats count() by bin(5m) + ``` + Operators should create a CloudWatch Logs metric filter and alarm on this field to detect sustained Bedrock outages affecting task throughput. - **Task description length limit** — Task descriptions are capped at 2,000 characters to bound the attack surface for prompt injection and reduce the risk of resource exhaustion from oversized payloads. ## Blueprint custom steps trust boundary @@ -199,11 +203,11 @@ AgentCore Memory has **no native backup mechanism**. This is a significant gap f ## Known limitations - **Single GitHub OAuth token** — one token may be shared for all users and repos the platform can access. Any authenticated user can trigger agent work against any repo that token can access. There is no per-user repo scoping. -- **Guardrails are input-only** — the `PROMPT_ATTACK` filter screens task descriptions at submission. No guardrails are applied to model output during agent execution or to review feedback entering the memory system. For `pr_iteration` and `pr_review` tasks, the PR context (review comments, diff, PR body, issue comments) fetched during context hydration is **not** screened by Bedrock Guardrails — it bypasses the submission-time filter entirely. A `pr_iteration` or `pr_review` task submitted without a `task_description` receives no guardrail screening at all. +- **Guardrails are input-only** — the `PROMPT_ATTACK` filter screens task descriptions at submission and assembled PR prompts during context hydration. No guardrails are applied to model output during agent execution or to review feedback entering the memory system.
For `pr_iteration` and `pr_review` tasks, the assembled user prompt (including PR body, review comments, conversation comments, diff summary, and task description) is screened through the Bedrock Guardrail during hydration; if blocked, the task fails with a descriptive error. Guardrail screening follows a fail-closed pattern: a Bedrock outage blocks task submissions (HTTP 503) and fails PR tasks during hydration. - **No memory content validation** — retrieved memory records are injected into the agent's context without sanitization, injection pattern scanning, or trust scoring. This is the most critical memory security gap (OWASP ASI06). See [MEMORY.md](/design/memory#memory-security-analysis) for the full gap analysis and [ROADMAP.md Iteration 3e](/roadmap/roadmap) for the remediation plan. - **No memory provenance or integrity checking** — memory entries carry no source attribution, content hashing, or trust metadata. The system cannot distinguish agent-generated memory from externally-influenced content. -- **GitHub issue content as untrusted input** — issue bodies and comments (attacker-controlled) are injected into the agent's context during hydration without trust differentiation. -- **PR review comments as untrusted input** — for `pr_iteration` and `pr_review` tasks, review comments, PR body, and conversation comments are fetched and injected into the agent's context. These are attacker-controlled inputs subject to the same prompt injection risks as issue comments. Unlike task descriptions, PR context is not screened by the Bedrock Guardrails `PROMPT_ATTACK` filter. For `pr_review` tasks, defense-in-depth mitigates the risk: the agent runs without `Write` or `Edit` tools, so even if prompt injection succeeds, the agent cannot modify files or push code. +- **GitHub issue content as untrusted input** — issue bodies and comments (attacker-controlled) are injected into the agent's context during hydration for `new_task` tasks without guardrail screening. 
Only the user-supplied `task_description` is screened at submission time; the fetched issue content bypasses both screening points. This is a known gap — extending hydration-time guardrail screening to `new_task` issue content is planned as a follow-up. +- **PR review comments as untrusted input** — for `pr_iteration` and `pr_review` tasks, review comments, PR body, and conversation comments are fetched and injected into the agent's context. These are attacker-controlled inputs subject to the same prompt injection risks as issue comments. The assembled PR prompt is now screened by the Bedrock Guardrails `PROMPT_ATTACK` filter during context hydration; if prompt injection is detected, the task fails before reaching the agent. For `pr_review` tasks, additional defense-in-depth mitigates residual risk: the agent runs without `Write` or `Edit` tools, so even if injection bypasses the guardrail, the agent cannot modify files or push code. - **No memory rollback or quarantine** — the 365-day AgentCore Memory expiration is the only cleanup mechanism. There is no snapshot, rollback, or quarantine capability for suspected poisoned entries. - **No MFA** — Cognito MFA is disabled (CLI-based auth flow). Should be enabled for production deployments. - **No customer-managed KMS** — all encryption at rest uses AWS-managed keys. Customer-managed KMS can be added if required by compliance policy. 
diff --git a/docs/src/content/docs/developer-guide/Project-structure.md b/docs/src/content/docs/developer-guide/Project-structure.md index 4679f76..003f6a3 100644 --- a/docs/src/content/docs/developer-guide/Project-structure.md +++ b/docs/src/content/docs/developer-guide/Project-structure.md @@ -47,7 +47,7 @@ cdk/src/ │ ├── webhook-create-task.ts # POST /webhooks/tasks Lambda (HMAC-SHA256 verification) │ └── shared/ │ ├── create-task-core.ts # Shared task creation logic (Cognito + webhook) -│ ├── context-hydration.ts # GitHub issue fetching, prompt assembly, token budget +│ ├── context-hydration.ts # GitHub issue fetching, prompt assembly, token budget, guardrail screening │ ├── gateway.ts # User extraction, webhook context, branch naming │ ├── logger.ts # Structured logger │ ├── orchestrator.ts # Orchestrator step helpers (DDB, AgentCore, concurrency) diff --git a/docs/src/content/docs/roadmap/Roadmap.md b/docs/src/content/docs/roadmap/Roadmap.md index 3e8b1d6..453d76d 100644 --- a/docs/src/content/docs/roadmap/Roadmap.md +++ b/docs/src/content/docs/roadmap/Roadmap.md @@ -165,6 +165,7 @@ These practices apply continuously across iterations and are not treated as one- - **Tier 3 — Risk and blast radius analysis** — Analyze the scope and impact of the agent's changes to detect unintended side effects in other parts of the codebase. Includes: dependency graph analysis (what modules/functions consume the changed code), change surface area (number of files, lines, and modules touched), semantic impact assessment (does the change alter public APIs, shared types, configuration, or database schemas), and regression risk scoring. Produces a **risk level** (low / medium / high / critical) attached to the PR as a label and included in the validation report. High-risk changes may require explicit human approval before merge (foundation for the HITL approval mode in Iteration 6). 
The risk level considers: number of downstream dependents affected, whether the change touches shared infrastructure or core abstractions, test coverage of the affected area, and whether the change introduces new external dependencies. - **PR risk level and validation report** — Every agent-created PR includes a structured **validation report** (as a PR comment or check run) summarizing: Tier 1 results (pass/fail per tool), Tier 2 findings (code quality issues by severity), Tier 3 risk assessment (risk level, blast radius summary, affected modules). The PR is labeled with the computed risk level (`risk:low`, `risk:medium`, `risk:high`, `risk:critical`). Risk level is persisted in the task record for evaluation and trending. See [EVALUATION.md](/design/evaluation#pr-risk-level). - [x] **Other task types: PR review and PR-iteration** — Support additional task types beyond "implement from issue": **iterate on pull request** (`pr_iteration`) reads review comments and addresses them (implement changes, push updates, post summary). **Review pull request** (`pr_review`) is a read-only task type where the agent analyzes a PR's changes and posts structured review comments via the GitHub Reviews API. The `pr_review` agent runs without `Write` or `Edit` tools (defense-in-depth), skips `ensure_committed` and push, and treats build status as informational only. Each review comment uses a structured format: type (comment/question/issue/good_point), severity for issues (minor/medium/major/critical), title, description with memory attribution, proposed fix, and a ready-to-use AI prompt. The CLI exposes `--review-pr ` (mutually exclusive with `--pr`). +- [x] **Input guardrail screening (Bedrock Guardrails)** — Amazon Bedrock Guardrails screen task descriptions at submission time and assembled PR prompts during context hydration (`pr_iteration`, `pr_review`). Uses `PROMPT_ATTACK` content filter at `HIGH` strength. 
Fail-closed: Bedrock outages block tasks rather than letting unscreened content through. See [SECURITY.md](/design/security). - **Multi-modal input** — Accept text and images (or other modalities) in the task payload; pass through to the agent. Gateway and schema support it; agent harness supports it where available. Primary use case: screenshots of bugs, UI mockups, or design specs attached to issues. **Builds on Iteration 3b:** Memory is operational; this iteration changes the orchestrator blueprint (tiered validation pipeline, new task type) and broadens the input schema. These are independently testable from memory. @@ -259,7 +260,7 @@ Deep research identified **9 memory-layer security gaps** in the current archite - **Adaptive model router with cost-aware cascade** — Per-turn model selection via a lightweight heuristic engine. File reads and simple edits use a cheaper model (Haiku); multi-file refactors use Sonnet; complex reasoning escalates to Opus. Error escalation: if the agent fails twice on the same step, upgrade model for the retry. As the cost budget ceiling approaches, cascade down to cheaper models. Blueprint `modelCascade` config enables per-repo tuning. Potential 30-40% cost reduction on inference-dominated workloads. Requires agent harness changes to support mid-session model switching. - **Advanced evaluation and feedback loop** — Extend the basic evaluation pipeline from Iteration 3d: ML-based or LLM-based trace analysis (not just rules), A/B prompt comparison framework, automated feedback into prompt templates (e.g. "for repo X, always run tests before opening PR"), and per-repo or per-failure-type improvement tracking. Evaluation results can update the repo's agent configuration stored during onboarding. **Optional patterns from adaptive teaching research** (e.g. 
plan → targeted critique → execution; separate **evaluator** vs **prompt/reflection** roles; fitness from LLM judging plus efficiency metrics; evolution of teaching templates from failed trajectories with Pareto-style candidate sets for diverse failure modes) can inform offline or scheduled improvement of Blueprint prompts and checklists without replacing ABCA's core orchestrator. - **Formal orchestrator verification (TLA+)** — Add a formal specification of the orchestrator in TLA+ and verify it with TLC model checking. Scope includes the task state machine (8 states, valid transitions, terminal states), concurrency admission control (atomic increment + max check), cancellation races (cancel arriving during any orchestration step), reconciler/orchestrator interleavings (counter drift correction while tasks are active), and the polling loop (agent writes terminal status, orchestrator observes and finalizes). Define invariants such as valid-state progression, no illegal transitions, and repo-level safety constraints (for example, at most one active `RUNNING` task per repo when configured). Keep the spec aligned with `src/constructs/task-status.ts` and orchestrator docs so regressions surface as model-check counterexamples before production. -- **Guardrails** — Natural-language or policy-based **guardrails** on agent tool calls using Amazon Bedrock Guardrails. Defends against prompt injection, restricts sensitive content generation, and enforces organizational policies (e.g. "do not modify files in `/infrastructure`"). See [SECURITY.md](/design/security). Guardrails configuration can be per-repo (via onboarding) or platform-wide. +- **Guardrails (output and tool-call)** — Extend Bedrock Guardrails from input screening (implemented in Iteration 3c) to **output filtering** and **agent tool-call guardrails**. Apply content filters to model responses during agent execution, restrict sensitive content generation, and enforce organizational policies (e.g. 
"do not modify files in `/infrastructure`"). See [SECURITY.md](/design/security). Guardrails configuration can be per-repo (via onboarding) or platform-wide. - **Capability-based security model** — Fine-grained enforcement beyond Bedrock Guardrails, operating at three levels: (1) **Tool-level capabilities** — Bash command allowlist (git, npm, make permitted; curl, wget blocked), configurable per capability tier (standard / elevated / read-only). (2) **File-system scope** — Blueprint declares include/exclude path patterns; Write/Edit/Read tools are filtered to the declared scope. (3) **Input trust scoring** — Authenticated user input = trusted; external GitHub issues = untrusted; PR review comments entering memory = adversarial. Trust level selects the capability set. Essential once review feedback memory (Iter 3d) introduces attacker-controlled content into the agent's context. Blueprint `security` prop configures the capability profile per repo. - **Additional execution environment** — Support an alternative to AgentCore Runtime (e.g. ECS/Fargate, EKS) behind the **ComputeStrategy** interface (see [REPO_ONBOARDING.md](/design/repo-onboarding#compute-strategy-interface)). The orchestrator calls abstract methods (`startSession`, `stopSession`, `pollSession`); the implementation maps to AgentCore, Fargate, or EKS. Repos select the strategy via `compute_type` in their blueprint configuration. Reduces vendor lock-in and enables workloads that exceed AgentCore limits (e.g. GPU, larger images, longer sessions). The ComputeStrategy interface contract is defined in Iteration 3a; Iteration 5 adds alternative implementations. - **Full web dashboard** — Extend the control panel from Iteration 4: detailed dashboards (cost, performance, evaluation), reasoning trace viewer or log explorer (linked to OpenTelemetry traces from AgentCore), task submit/cancel from the UI, and admin views (system health, capacity, user management). 
@@ -292,12 +293,12 @@ Deep research identified **9 memory-layer security gaps** in the current archite - **Iteration 2** — Production orchestrator, API contract, task management (list/status/cancel), durable execution, observability, threat model, network isolation, basic cost guardrails, CI/CD. - **Iteration 3a** — Repo onboarding, DNS Firewall (domain-level egress filtering), webhook trigger, GitHub Actions, per-repo customization (prompt from repo), data retention, turn/iteration caps, cost budget caps, user prompt guide, agent harness improvements (turn budget, default branch, safety net, lint, softened conventions), operator dashboard, WAF, model invocation logging, input length limits. - **Iteration 3b** ✅ — Memory Tier 1 (repo knowledge, task episodes), insights, agent self-feedback, prompt versioning, per-prompt commit attribution. CDK L2 construct with named semantic + episodic strategies using namespace templates (`/{actorId}/knowledge/`, `/{actorId}/episodes/{sessionId}/`), fail-open memory load/write, orchestrator fallback episode, SHA-256 prompt hashing, git trailer attribution. -- **Iteration 3c** — Per-repo GitHub App credentials, orchestrator pre-flight checks (fail-closed before session start), persistent session storage for select caches (AgentCore Runtime `/mnt/workspace` mount for npm/Claude config; mise/uv/repo on local disk due to FUSE `flock()` limitation), pre-execution task risk classification (model/limits/approval policy selection), tiered validation pipeline (tool validation, code quality analysis, post-execution risk/blast radius analysis), PR risk level, PR review task type (`pr_review` — read-only structured review with tool restriction, defense-in-depth enforcement, CLI `--review-pr` flag), multi-modal input. 
+- **Iteration 3c** — Per-repo GitHub App credentials, orchestrator pre-flight checks (fail-closed before session start), persistent session storage for select caches (AgentCore Runtime `/mnt/workspace` mount for npm/Claude config; mise/uv/repo on local disk due to FUSE `flock()` limitation), pre-execution task risk classification (model/limits/approval policy selection), tiered validation pipeline (tool validation, code quality analysis, post-execution risk/blast radius analysis), PR risk level, PR review task type (`pr_review` — read-only structured review with tool restriction, defense-in-depth enforcement, CLI `--review-pr` flag), input guardrail screening (Bedrock Guardrails, fail-closed), multi-modal input. - **Iteration 3d** — Review feedback memory loop (Tier 2), PR outcome tracking, evaluation pipeline (basic). - **Iteration 3e** — Memory security and integrity: input hardening (content sanitization, provenance tagging, integrity hashing), trust-aware retrieval (trust scoring, temporal decay, guardian validation), detection and response (anomaly detection, circuit breaker, quarantine, rollback), advanced protections (write-ahead validation, behavioral drift detection, cryptographic provenance, red teaming). Addresses OWASP ASI06 (Memory & Context Poisoning). 
- **Iteration 3bis** (hardening) — Orchestrator IAM grant for Memory (was silently AccessDenied), memory schema versioning (`schema_version: "2"`), Python repo format validation, severity-aware error logging in Python memory, narrowed entrypoint try-catch, orchestrator fallback episode observability, conditional writes in agent task_state.py (ConditionExpression guards), orchestrator Lambda error alarm (CloudWatch, retryAttempts: 0), concurrency counter reconciliation (scheduled Lambda, drift correction), multi-AZ NAT documentation (already configurable), Python unit tests (pytest), entrypoint decomposition (4 extracted subfunctions), dual prompt assembly deprecation docstring, graceful thread drain in server.py (shutdown hook + atexit), dead QUEUED state removal (8 states, 4 active). - **Iteration 4** — Additional git providers, visual proof (screenshots/videos), Slack channel, skills pipeline, user preference memory (Tier 3), control panel (restrict CORS to dashboard origin), real-time event streaming (WebSocket), live session replay and mid-task nudge, browser extension client, MFA for production. -- **Iteration 5** — Snapshot-on-schedule pre-warming, multi-user/team, memory isolation for multi-tenancy, full cost management, adaptive model router with cost-aware cascade, advanced evaluation (optional adaptive-teaching / trajectory-driven prompt patterns), formal orchestrator verification with TLA+/TLC, full Bedrock Guardrails (PII, denied topics, output filters), capability-based security model, alternate runtime, advanced customization with tiered tool access (MCP/plugins via AgentCore Gateway), full dashboard, AI-specific WAF rules. 
+- **Iteration 5** — Snapshot-on-schedule pre-warming, multi-user/team, memory isolation for multi-tenancy, full cost management, adaptive model router with cost-aware cascade, advanced evaluation (optional adaptive-teaching / trajectory-driven prompt patterns), formal orchestrator verification with TLA+/TLC, Bedrock Guardrails output and tool-call filtering (PII, denied topics, output filters; input screening shipped in Iteration 3c), capability-based security model, alternate runtime, advanced customization with tiered tool access (MCP/plugins via AgentCore Gateway), full dashboard, AI-specific WAF rules. - **Iteration 6** — Agent swarm orchestration, skills learning, multi-repo, iterative feedback and multiplayer sessions, HITL approval, scheduled triggers, CDK constructs. Design docs to keep in sync: [ARCHITECTURE.md](/design/architecture), [ORCHESTRATOR.md](/design/orchestrator), [API_CONTRACT.md](/design/api-contract), [INPUT_GATEWAY.md](/design/input-gateway), [REPO_ONBOARDING.md](/design/repo-onboarding), [MEMORY.md](/design/memory), [OBSERVABILITY.md](/design/observability), [COMPUTE.md](/design/compute), [CONTROL_PANEL.md](/design/control-panel), [SECURITY.md](/design/security), [EVALUATION.md](/design/evaluation).
diff --git a/docs/src/content/docs/user-guide/Task-lifecycle.md b/docs/src/content/docs/user-guide/Task-lifecycle.md index ddfc787..da99c79 100644 --- a/docs/src/content/docs/user-guide/Task-lifecycle.md +++ b/docs/src/content/docs/user-guide/Task-lifecycle.md @@ -20,7 +20,7 @@ The orchestrator uses Lambda Durable Functions to manage the lifecycle durably | `HYDRATING` | Orchestrator passed admission control; assembling the agent payload | | `RUNNING` | Agent session started and actively working on the task | | `COMPLETED` | Agent finished and created a PR (or determined no changes were needed) | -| `FAILED` | Agent encountered an error, or user concurrency limit was reached | +| `FAILED` | Agent encountered an error, user concurrency limit was reached, or content was blocked by (or could not be screened by) guardrail screening | | `CANCELLED` | Task was cancelled by the user | | `TIMED_OUT` | Task exceeded the maximum allowed duration (~9 hours) | @@ -42,4 +42,4 @@ Each lifecycle transition is recorded as an audit event. Use the events endpoint curl "$API_URL/tasks//events" -H "Authorization: $TOKEN" ``` -Events include: `task_created`, `admission_rejected`, `preflight_failed`, `hydration_started`, `hydration_complete`, `session_started`, `pr_created`, `pr_updated`, `task_completed`, `task_failed`, `task_cancelled`, `task_timed_out`. Event records are subject to the same 90-day retention as task records and are automatically deleted after that period. \ No newline at end of file +Events include: `task_created`, `admission_rejected`, `preflight_failed`, `hydration_started`, `hydration_complete`, `guardrail_blocked`, `session_started`, `pr_created`, `pr_updated`, `task_completed`, `task_failed`, `task_cancelled`, `task_timed_out`. Event records are subject to the same 90-day retention as task records and are automatically deleted after that period.
\ No newline at end of file diff --git a/docs/src/content/docs/user-guide/Tips.md b/docs/src/content/docs/user-guide/Tips.md index 55b84c2..755604e 100644 --- a/docs/src/content/docs/user-guide/Tips.md +++ b/docs/src/content/docs/user-guide/Tips.md @@ -7,4 +7,5 @@ title: Tips - **Add a CLAUDE.md**: The agent automatically loads project-level configuration from your repository — `CLAUDE.md`, `.claude/CLAUDE.md`, `.claude/rules/*.md`, `.claude/settings.json`, `.claude/agents/`, and `.mcp.json`. Use these to provide project-specific build commands, conventions, constraints, custom subagents, and architecture notes. See the [Prompt guide](/user-guide/prompt-guide#repo-level-customization) for details and examples. - **Issue vs text**: When using `--issue` (CLI) or `issue_number` (API), the agent fetches the full issue body from GitHub, including any labels, comments, and linked context. This is usually better than a short text description. - **Cost**: Cost depends on the model and number of turns. Use `--max-turns` (CLI) or `max_turns` (API) to cap the number of agent iterations per task (range: 1–500). If not specified, the per-repo Blueprint default applies, falling back to the platform default (100). Use `--max-budget` (CLI) or `max_budget_usd` (API) to set a hard cost limit in USD ($0.01–$100) — when the budget is reached, the agent stops regardless of remaining turns. If no budget is specified, the per-repo Blueprint default applies; if that is also absent, no cost limit is enforced. Check the task status after completion to see the reported cost. +- **Content screening**: Task descriptions and PR context are screened by Bedrock Guardrails for prompt injection. If a submission is rejected with `GUARDRAIL_BLOCKED`, revise your description and retry; if a PR task fails during hydration, check the task events (`guardrail_blocked`) for the reason and review the PR body and comments. - **Idempotency**: Use the `Idempotency-Key` header when creating tasks via the API to safely retry requests without creating duplicate tasks.
\ No newline at end of file diff --git a/docs/src/content/docs/user-guide/Using-the-rest-api.md b/docs/src/content/docs/user-guide/Using-the-rest-api.md index 526c7b6..f35021c 100644 --- a/docs/src/content/docs/user-guide/Using-the-rest-api.md +++ b/docs/src/content/docs/user-guide/Using-the-rest-api.md @@ -96,6 +96,8 @@ curl -X POST "$API_URL/tasks" \ | `max_turns` | number | No | Maximum agent turns (1–500). Overrides the per-repo Blueprint default. Platform default: 100. | | `max_budget_usd` | number | No | Maximum cost budget in USD (0.01–100). When reached, the agent stops regardless of remaining turns. Overrides the per-repo Blueprint default. If omitted, no budget limit is applied. | +**Content screening:** Task descriptions are automatically screened by Amazon Bedrock Guardrails for prompt injection before the task is created. If content is blocked, you receive a `400 GUARDRAIL_BLOCKED` error — revise the description and retry. If the screening service is temporarily unavailable, you receive a `503` error — retry after a short delay. For PR tasks (`pr_iteration`, `pr_review`), the assembled prompt (including PR body and review comments) is also screened during context hydration; if blocked, the task transitions to `FAILED`. + **Idempotency:** Include an `Idempotency-Key` header (alphanumeric, dashes, underscores, max 128 chars) to prevent duplicate task creation on retries: ```bash