From 5cd7f7620f0fae223750f94e43c7e26baae0ae43 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Fri, 10 Apr 2026 10:32:50 -0500 Subject: [PATCH 01/11] feat(merges): add dismiss actions for failed beads on Merge Queue page (#2295) * feat(merges): add dismiss actions for failed beads on Merge Queue page - Add individual Dismiss (X) button to each failed bead row in AttentionItemRow - Add bulk 'Dismiss all failed (N)' button to NeedsAttention header area - Fix view button fallback: open MR bead when sourceBead is null (orphaned beads) - Both individual and bulk dismiss call updateBead with status: 'closed' on the MR bead - Dismiss all shows loading spinner and toast on completion/error * fix(merges): use rigId directly in openDrawer to fix TS typecheck --------- Co-authored-by: John Fawcett --- .../[townId]/merges/NeedsAttention.tsx | 140 ++++++++++++++---- 1 file changed, 111 insertions(+), 29 deletions(-) diff --git a/apps/web/src/app/(app)/gastown/[townId]/merges/NeedsAttention.tsx b/apps/web/src/app/(app)/gastown/[townId]/merges/NeedsAttention.tsx index a7fc67031..cdd73bc7b 100644 --- a/apps/web/src/app/(app)/gastown/[townId]/merges/NeedsAttention.tsx +++ b/apps/web/src/app/(app)/gastown/[townId]/merges/NeedsAttention.tsx @@ -1,6 +1,6 @@ 'use client'; -import { useState, useMemo, Fragment } from 'react'; +import { useState, useMemo, Fragment, useCallback } from 'react'; import { useMutation, useQueryClient } from '@tanstack/react-query'; import { useSession } from 'next-auth/react'; import { useGastownTRPC } from '@/lib/gastown/trpc'; @@ -15,7 +15,9 @@ import { Eye, GitBranch, GitMerge, + Loader2, RefreshCw, + X, XCircle, CheckCircle2, Clock, @@ -113,6 +115,8 @@ export function NeedsAttention({ }) { const session = useSession(); const isAdmin = session?.data?.isAdmin ?? false; + const trpc = useGastownTRPC(); + const queryClient = useQueryClient(); const totalCount = data.openPRs.length + data.failedReviews.length + data.stalePRs.length; // Tag each item with its category for rendering @@ -139,6 +143,39 @@ export function NeedsAttention({ return map; }, [allItems]); + const failedItems = useMemo( + () => allItems.filter(({ category }) => category === 'failed').map(({ item }) => item), + [allItems] + ); + + const [isDismissingAll, setIsDismissingAll] = useState(false); + const updateBeadMutation = useMutation(trpc.gastown.updateBead.mutationOptions({})); + + const dismissAllFailed = useCallback(async () => { + if (failedItems.length === 0) return; + setIsDismissingAll(true); + try { + await Promise.all( + failedItems.map(item => + updateBeadMutation.mutateAsync({ + rigId: item.mrBead.rig_id ?? '', + beadId: item.mrBead.bead_id, + status: 'closed', + }) + ) + ); + void queryClient.invalidateQueries({ + queryKey: trpc.gastown.getMergeQueueData.queryKey({ townId }), + }); + toast.success(`Dismissed ${failedItems.length} failed ${failedItems.length === 1 ? 'bead' : 'beads'}`); + } catch (err: unknown) { + const message = err instanceof Error ? err.message : 'Unknown error'; + toast.error(`Failed to dismiss all: ${message}`); + } finally { + setIsDismissingAll(false); + } + }, [failedItems, updateBeadMutation, queryClient, trpc, townId]); + if (totalCount === 0) { return (
@@ -150,6 +187,24 @@ export function NeedsAttention({ return (
+ {/* Dismiss all failed button */} + {failedItems.length > 0 && ( +
+ +
+ )} + {/* Convoy groups */} {convoyGroups.map(group => ( @@ -335,6 +390,18 @@ function AttentionItemRow({ }) ); + const dismissMutation = useMutation( + trpc.gastown.updateBead.mutationOptions({ + onSuccess: () => { + invalidateMergeQueue(); + toast.success('Bead dismissed'); + }, + onError: (err: { message: string }) => { + toast.error(`Failed to dismiss: ${err.message}`); + }, + }) + ); + // Fail bead mutation: use adminForceFailBead const failMutation = useMutation( trpc.gastown.adminForceFailBead.mutationOptions({ @@ -349,7 +416,7 @@ function AttentionItemRow({ }) ); - const isPending = retryMutation.isPending || failMutation.isPending; + const isPending = retryMutation.isPending || failMutation.isPending || dismissMutation.isPending; const handleConfirm = () => { if (!confirmAction) return; @@ -388,13 +455,12 @@ function AttentionItemRow({ + <> + + + )}
+ {/* ── Debug ──────────────────────────────────────────── */} + +
+

+ Copies a JSON snapshot of your town configuration for troubleshooting. API + tokens, email addresses, and custom instruction contents are excluded. +

+ +
+
+ {/* ── Danger Zone ──────────────────────────────────────── */}
From 200c292af11d6d2515eab3be7ae3cf6650bb4045 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Fri, 10 Apr 2026 16:48:23 -0500 Subject: [PATCH 03/11] chore(gastown): remove dead popReviewQueue and update stale comments (#2318) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * chore(gastown): remove dead popReviewQueue and update stale comments Remove popReviewQueue() from review-queue.ts and its wrapper from Town.do.ts — confirmed no callers in the tRPC router, reconciler, or anywhere else. Also remove the Town.do.ts completeReview() wrapper (also had no external callers) and update stale comments across review-queue.ts, Town.do.ts, reconciler.ts, beads.ts, and container-dispatch.ts that referenced old patrol/scheduling functions (feedStrandedConvoys, rehookOrphanedBeads, schedulePendingWork, recoverStuckReviews, witnessPatrol, processReviewQueue) to reflect the current reconciler-based architecture. Closes #1403 * ci: retrigger Kilo Code Review (previous run failed due to transient clone error) * test(gastown): update integration tests to remove removed popReviewQueue/completeReview APIs popReviewQueue() and completeReview() were removed from TownDO as dead code. Update integration tests to use listBeads({ type: 'merge_request' }) instead of popReviewQueue() to observe MR bead state, and remove the regression guard test for completeReview which is no longer testable via the TownDO public API. --------- Co-authored-by: John Fawcett --- services/gastown/src/dos/Town.do.ts | 23 ++--- services/gastown/src/dos/town/beads.ts | 4 +- .../src/dos/town/container-dispatch.ts | 8 +- services/gastown/src/dos/town/reconciler.ts | 4 +- services/gastown/src/dos/town/review-queue.ts | 83 ++++--------------- .../test/integration/review-failure.test.ts | 23 ----- .../test/integration/rig-alarm.test.ts | 14 ++-- .../gastown/test/integration/rig-do.test.ts | 68 +++++---------- 8 files changed, 61 insertions(+), 166 deletions(-) diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index 75a654ccf..f274cebcf 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -1670,14 +1670,6 @@ export class TownDO extends DurableObject { await this.escalateToActiveCadence(); } - async popReviewQueue(): Promise { - return reviewQueue.popReviewQueue(this.sql); - } - - async completeReview(entryId: string, status: 'merged' | 'failed'): Promise { - reviewQueue.completeReview(this.sql, entryId, status); - } - async completeReviewWithResult(input: { entry_id: string; status: 'merged' | 'failed' | 'conflict'; @@ -1712,10 +1704,9 @@ export class TownDO extends DurableObject { }); } - // Rework is handled by the normal scheduling path: the failed/conflict + // Rework is handled by the reconciler's scheduling path: the failed/conflict // path in completeReviewWithResult sets the source bead to 'open' with - // assignee cleared. feedStrandedConvoys or rehookOrphanedBeads will - // hook a polecat, and schedulePendingWork will dispatch it. + // assignee cleared. The reconciler will hook a polecat and dispatch it. } async agentDone(agentId: string, input: AgentDoneInput): Promise { @@ -3558,9 +3549,9 @@ export class TownDO extends DurableObject { } // ── Pre-phase: Observe container status for working agents ──────── - // Replaces witnessPatrol's zombie detection. Poll the container for - // each working/stalled agent and emit container_status events. These - // are drained in Phase 0 and applied before reconciliation. + // Poll the container for each working/stalled agent and emit + // container_status events. These are drained in Phase 0 and applied + // before reconciliation. try { const workingAgentRows = z .object({ bead_id: z.string() }) @@ -4487,8 +4478,8 @@ export class TownDO extends DurableObject { // Only count idle+hooked agents as orphaned if they've been idle for // longer than the dispatch cooldown. Agents that were just hooked by - // feedStrandedConvoys or restarted with backoff are legitimately - // waiting for the next scheduler tick. + // the reconciler or restarted with backoff are legitimately waiting + // for the next scheduler tick. const orphanedHooks = Number( [ ...query( diff --git a/services/gastown/src/dos/town/beads.ts b/services/gastown/src/dos/town/beads.ts index 25d76d8a3..1f1409c33 100644 --- a/services/gastown/src/dos/town/beads.ts +++ b/services/gastown/src/dos/town/beads.ts @@ -421,8 +421,8 @@ export function updateConvoyProgress(sql: SqlStorage, beadId: string, timestamp: if (featureBranch && mergeMode === 'review-then-land') { // Mark the convoy as ready to land by storing a flag in metadata. - // The alarm loop's processReviewQueue will detect this and create - // the final landing MR (feature branch → main). + // The reconciler will detect this and create the final landing + // MR (feature branch → main). query( sql, /* sql */ ` diff --git a/services/gastown/src/dos/town/container-dispatch.ts b/services/gastown/src/dos/town/container-dispatch.ts index e113bc417..f89300abb 100644 --- a/services/gastown/src/dos/town/container-dispatch.ts +++ b/services/gastown/src/dos/town/container-dispatch.ts @@ -647,12 +647,12 @@ export async function checkAgentContainerStatus( signal: AbortSignal.timeout(10_000), }); // 404 means the container is running but has no record of this agent - // (e.g. after container eviction). Report as 'not_found' so - // witnessPatrol can immediately reset and redispatch the agent + // (e.g. after container eviction). Report as 'not_found' so the + // reconciler can immediately reset and redispatch the agent // instead of waiting for the 2-hour GUPP timeout. if (response.status === 404) return { status: 'not_found' }; // Non-OK but not 404 — container is having issues but may still - // have the agent running. Return 'unknown' so witnessPatrol doesn't + // have the agent running. Return 'unknown' so the reconciler doesn't // falsely reset a working agent. if (!response.ok) return { status: 'unknown' }; const data: unknown = await response.json(); @@ -668,7 +668,7 @@ export async function checkAgentContainerStatus( return { status: 'unknown' }; } catch { // Timeout, network error, or container starting up — return - // 'unknown' so witnessPatrol doesn't falsely reset working agents. + // 'unknown' so the reconciler doesn't falsely reset working agents. // True zombies will be caught after repeated 'unknown' results // once the GIPP/heartbeat timeout expires. return { status: 'unknown' }; diff --git a/services/gastown/src/dos/town/reconciler.ts b/services/gastown/src/dos/town/reconciler.ts index a7bfcc9ac..111473fd4 100644 --- a/services/gastown/src/dos/town/reconciler.ts +++ b/services/gastown/src/dos/town/reconciler.ts @@ -539,7 +539,7 @@ export function reconcileAgents(sql: SqlStorage, opts?: { draining?: boolean }): // Agent is working with fresh heartbeat but no hook — it's running // in the container but has no bead to work on (gt_done already ran, // or the hook was cleared by another code path). Set to idle so - // processReviewQueue / schedulePendingWork can use it. + // the reconciler can dispatch it to new work. actions.push({ type: 'transition_agent', agent_id: agent.bead_id, @@ -810,7 +810,7 @@ export function reconcileBeads( }); } - // Rule 2: Idle agents with hooks need dispatch (schedulePendingWork equivalent) + // Rule 2: Idle agents with hooks need dispatch const idleHooked = AgentRow.array().parse([ ...query( sql, diff --git a/services/gastown/src/dos/town/review-queue.ts b/services/gastown/src/dos/town/review-queue.ts index e25819107..da6402496 100644 --- a/services/gastown/src/dos/town/review-queue.ts +++ b/services/gastown/src/dos/town/review-queue.ts @@ -208,53 +208,6 @@ export function submitToReviewQueue(sql: SqlStorage, input: ReviewQueueInput): v }); } -export function popReviewQueue(sql: SqlStorage): ReviewQueueEntry | null { - // Pop the oldest open MR bead, but skip any whose source bead already - // has another MR in_progress (i.e. a refinery is already reviewing it). - // This prevents popping stale MR beads and triggering failReviewWithRework - // while an active review is in flight for the same source. - // - // The source bead is linked via bead_dependencies (dependency_type='tracks'): - // bead_dependencies.bead_id = MR bead - // bead_dependencies.depends_on_bead_id = source bead - const rows = [ - ...query( - sql, - /* sql */ ` - ${REVIEW_JOIN} - WHERE ${beads.status} = 'open' - AND NOT EXISTS ( - SELECT 1 FROM ${beads} AS active_mr - WHERE active_mr.${beads.columns.type} = 'merge_request' - AND active_mr.${beads.columns.status} = 'in_progress' - AND active_mr.${beads.columns.rig_id} = ${beads.rig_id} - ) - ORDER BY ${beads.created_at} ASC - LIMIT 1 - `, - [] - ), - ]; - - if (rows.length === 0) return null; - const parsed = MergeRequestBeadRecord.parse(rows[0]); - const entry = toReviewQueueEntry(parsed); - - // Mark as running (in_progress) - query( - sql, - /* sql */ ` - UPDATE ${beads} - SET ${beads.columns.status} = 'in_progress', - ${beads.columns.updated_at} = ? - WHERE ${beads.bead_id} = ? - `, - [now(), entry.id] - ); - - return { ...entry, status: 'running', processed_at: now() }; -} - export function completeReview( sql: SqlStorage, entryId: string, @@ -369,8 +322,8 @@ export function completeReviewWithResult( conflict: true, }, }); - // Return source bead to open so the normal scheduling path handles - // rework. Clear assignee so feedStrandedConvoys can match. + // Return source bead to open so the reconciler's scheduling path handles + // rework. Clear assignee so the reconciler can match it for dispatch. const conflictSourceBead = getBead(sql, entry.bead_id); if ( conflictSourceBead && @@ -390,11 +343,10 @@ export function completeReviewWithResult( } } else if (input.status === 'failed') { // Review failed (rework requested): return source bead to open so - // the normal scheduling path (feedStrandedConvoys → hookBead → - // schedulePendingWork → dispatch) handles rework. Clear the stale - // assignee so feedStrandedConvoys can match (requires assignee IS NULL). - // This avoids the fire-and-forget rework dispatch race in TownDO - // where the dispatch fails and rehookOrphanedBeads churn. + // the reconciler's scheduling path handles rework. Clear the stale + // assignee so the reconciler can match it for dispatch (requires + // assignee IS NULL). This avoids a fire-and-forget rework dispatch + // race where the dispatch fails and the bead churns. const sourceBead = getBead(sql, entry.bead_id); if (sourceBead && sourceBead.status !== 'closed' && sourceBead.status !== 'failed') { updateBeadStatus(sql, entry.bead_id, 'open', entry.agent_id); @@ -498,9 +450,8 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu const agent = getAgent(sql, agentId); if (!agent) throw new Error(`Agent ${agentId} not found`); if (!agent.current_hook_bead_id) { - // The agent was unhooked by a recovery path (witnessPatrol, - // rehookOrphanedBeads) between when the agent finished work and - // when it called gt_done. + // The agent was unhooked by a recovery path between when the agent + // finished work and when it called gt_done. // // For refineries, this is critical: the refinery successfully merged // but the hook was cleared by zombie detection. We MUST still complete @@ -648,9 +599,9 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu unhookBead(sql, agentId); // Set refinery to idle immediately — the review is done and the - // refinery is available for new work. Without this, processReviewQueue - // sees the refinery as 'working' and won't pop the next MR bead until - // agentCompleted fires (when the container process eventually exits). + // refinery is available for new work. Without this, the reconciler + // sees the refinery as 'working' and won't dispatch the next MR bead + // until agentCompleted fires (when the container process eventually exits). updateAgentStatus(sql, agentId, 'idle'); return; } @@ -659,7 +610,7 @@ export function agentDone(sql: SqlStorage, agentId: string, input: AgentDoneInpu if (!agent.rig_id) { console.warn( - `[review-queue] agentDone: agent ${agentId} has null rig_id — review entry may fail in processReviewQueue` + `[review-queue] agentDone: agent ${agentId} has null rig_id — review entry may fail in submitToReviewQueue` ); } @@ -718,13 +669,13 @@ export function agentCompleted( // NEVER fail or unhook a refinery from agentCompleted. // agentCompleted races with gt_done: the process exits, the // container sends /completed, but gt_done's HTTP request may - // still be in flight. If we unhook here, recoverStuckReviews - // can fire between agentCompleted and gt_done, resetting the - // MR bead that's about to be closed by gt_done. + // still be in flight. If we unhook here, a recovery path can + // fire between agentCompleted and gt_done, resetting the MR bead + // that's about to be closed by gt_done. // // Leave the hook intact. gt_done will close + unhook if the - // merge succeeded. recoverStuckReviews (which checks for - // status='working') handles the case where gt_done never arrives. + // merge succeeded. The reconciler (which checks for status='working') + // handles the case where gt_done never arrives. // // No-op for the bead — just fall through to mark agent idle. } else { diff --git a/services/gastown/test/integration/review-failure.test.ts b/services/gastown/test/integration/review-failure.test.ts index d5b7773c0..8fcb07cb0 100644 --- a/services/gastown/test/integration/review-failure.test.ts +++ b/services/gastown/test/integration/review-failure.test.ts @@ -182,29 +182,6 @@ describe('Review failure paths — convoy progress and source bead recovery', () }); }); - // ── Direct completeReview leaves source bead orphaned (regression) ─ - - describe('completeReview bypass (regression guard)', () => { - it('should leave source bead stuck in in_review when completeReview is called directly', async () => { - const { beadId, mrBeadId } = await setupConvoyWithMR(); - - // Call completeReview directly (the OLD broken path) — - // this is the raw SQL update that bypasses lifecycle events. - // We use this to verify the regression scenario. - await town.completeReview(mrBeadId, 'failed'); - - // MR bead should be failed - const mrBead = await town.getBeadAsync(mrBeadId); - expect(mrBead?.status).toBe('failed'); - - // Source bead is STILL in_review — this is the bug this PR fixes - // in processReviewQueue. The direct completeReview call doesn't - // return the source bead to in_progress. - const sourceBead = await town.getBeadAsync(beadId); - expect(sourceBead?.status).toBe('in_review'); - }); - }); - // ── Source bead in_review after agentDone ────────────────────────── describe('agentDone transitions source bead to in_review', () => { diff --git a/services/gastown/test/integration/rig-alarm.test.ts b/services/gastown/test/integration/rig-alarm.test.ts index a80cfc6b5..1ec79e696 100644 --- a/services/gastown/test/integration/rig-alarm.test.ts +++ b/services/gastown/test/integration/rig-alarm.test.ts @@ -158,9 +158,10 @@ describe('Town DO Alarm', () => { // fail gracefully and mark the review as 'failed' await runDurableObjectAlarm(town); - // The pending entry should have been popped (no more pending entries) - const nextEntry = await town.popReviewQueue(); - expect(nextEntry).toBeNull(); + // The MR bead should no longer be open (alarm processed it) + const mrBeads = await town.listBeads({ type: 'merge_request' }); + expect(mrBeads).toHaveLength(1); + expect(mrBeads[0].status).not.toBe('open'); }); }); @@ -293,9 +294,10 @@ describe('Town DO Alarm', () => { // (will fail at container level but that's expected in tests) await runDurableObjectAlarm(town); - // Review queue entry should have been popped and processed (failed in test env) - const reviewEntry = await town.popReviewQueue(); - expect(reviewEntry).toBeNull(); + // MR bead should have been picked up and processed (failed in test env) + const mrBeads = await town.listBeads({ type: 'merge_request' }); + expect(mrBeads).toHaveLength(1); + expect(mrBeads[0].status).not.toBe('open'); }); }); }); diff --git a/services/gastown/test/integration/rig-do.test.ts b/services/gastown/test/integration/rig-do.test.ts index eb22196fd..221f5bce6 100644 --- a/services/gastown/test/integration/rig-do.test.ts +++ b/services/gastown/test/integration/rig-do.test.ts @@ -356,7 +356,7 @@ describe('TownDO', () => { // ── Review Queue ─────────────────────────────────────────────────────── describe('review queue', () => { - it('should submit to and pop from review queue', async () => { + it('should submit to review queue and create an open merge_request bead', async () => { const agent = await town.registerAgent({ role: 'polecat', name: 'P1', @@ -373,40 +373,12 @@ describe('TownDO', () => { summary: 'Fixed the widget', }); - const entry = await town.popReviewQueue(); - expect(entry).toBeDefined(); - expect(entry?.branch).toBe('feature/fix-widget'); - expect(entry?.pr_url).toBe('https://github.com/org/repo/pull/1'); - expect(entry?.status).toBe('running'); - - // Pop again should return null (nothing pending) - const empty = await town.popReviewQueue(); - expect(empty).toBeNull(); - }); - - it('should complete a review', async () => { - const agent = await town.registerAgent({ - role: 'polecat', - name: 'P1', - identity: `complete-review-${townName}`, - }); - const bead = await town.createBead({ type: 'issue', title: 'Review complete' }); - - await town.submitToReviewQueue({ - agent_id: agent.id, - bead_id: bead.bead_id, - rig_id: 'test-rig', - branch: 'feature/fix', - }); - - const entry = await town.popReviewQueue(); - expect(entry).toBeDefined(); - - await town.completeReview(entry!.id, 'merged'); - - // Pop again should be null - const empty = await town.popReviewQueue(); - expect(empty).toBeNull(); + // submitToReviewQueue creates an open merge_request bead + const mrBeads = await town.listBeads({ type: 'merge_request' }); + expect(mrBeads).toHaveLength(1); + expect(mrBeads[0].status).toBe('open'); + expect(mrBeads[0].metadata?.pr_url).toBe('https://github.com/org/repo/pull/1'); + expect(mrBeads[0].metadata?.source_bead_id).toBe(bead.bead_id); }); it('should close bead on successful merge via completeReviewWithResult', async () => { @@ -424,11 +396,12 @@ describe('TownDO', () => { branch: 'feature/merge-test', }); - const entry = await town.popReviewQueue(); - expect(entry).toBeDefined(); + const mrBeads = await town.listBeads({ type: 'merge_request' }); + expect(mrBeads).toHaveLength(1); + const mrBeadId = mrBeads[0].bead_id; await town.completeReviewWithResult({ - entry_id: entry!.id, + entry_id: mrBeadId, status: 'merged', message: 'Merge successful', commit_sha: 'abc123', @@ -439,9 +412,9 @@ describe('TownDO', () => { expect(updatedBead?.status).toBe('closed'); expect(updatedBead?.closed_at).toBeDefined(); - // Review queue should be empty - const empty = await town.popReviewQueue(); - expect(empty).toBeNull(); + // MR bead should be closed + const updatedMr = await town.getBeadAsync(mrBeadId); + expect(updatedMr?.status).toBe('closed'); }); it('should create escalation bead on merge conflict via completeReviewWithResult', async () => { @@ -459,11 +432,12 @@ describe('TownDO', () => { branch: 'feature/conflict-test', }); - const entry = await town.popReviewQueue(); - expect(entry).toBeDefined(); + const mrBeads = await town.listBeads({ type: 'merge_request' }); + expect(mrBeads).toHaveLength(1); + const mrBeadId = mrBeads[0].bead_id; await town.completeReviewWithResult({ - entry_id: entry!.id, + entry_id: mrBeadId, status: 'conflict', message: 'CONFLICT (content): Merge conflict in src/index.ts', }); @@ -484,9 +458,9 @@ describe('TownDO', () => { agent_id: agent.id, }); - // Review queue entry should be marked as failed - const empty = await town.popReviewQueue(); - expect(empty).toBeNull(); + // MR bead should be marked as failed + const updatedMr = await town.getBeadAsync(mrBeadId); + expect(updatedMr?.status).toBe('failed'); }); }); From 4116c5275a0b950ba0704b650b914c5ea1dcb391 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Fri, 10 Apr 2026 17:01:14 -0500 Subject: [PATCH 04/11] fix(gastown): prevent triage batch bead dispatch loop with wrong system prompt (#2321) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(gastown): prevent triage batch bead dispatch loop with wrong system prompt Option A: Mark triage batch bead as in_progress immediately after hookBead() in maybeDispatchTriageAgent(), before awaiting startAgentInContainer(). This prevents reconciler Rule 2 (idle agent + open hooked bead → dispatch_agent) from re-dispatching the triage bead with the generic polecat prompt on the next tick when container start fails. Rule 3 (stale in_progress, 5-min timeout) resets it to open for a clean retry via maybeDispatchTriageAgent. Option B (defense-in-depth): In applyActionCtx.dispatchAgent, detect triage batch beads (gt:triage label + created_by='patrol') and inject the triage system prompt, ensuring the polecat gets the correct tools even if Rule 2 somehow fires against an open triage batch bead. Fixes #1958 * fix(gastown): set rig_id on triage batch bead so reconciler Rule 1 can re-dispatch after timeout Without rig_id, when Rule 3 resets an abandoned in_progress triage batch bead to 'open', Rule 1 skips it (requires rig_id IS NOT NULL). This left the bead permanently 'open', blocking maybeDispatchTriageAgent from creating a replacement. Setting rig_id ensures Rule 1 can re-dispatch the existing bead (with triage system prompt injected via Option B). --------- Co-authored-by: John Fawcett --- services/gastown/src/dos/Town.do.ts | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index f274cebcf..97398f20e 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -343,6 +343,17 @@ export class TownDO extends DurableObject { }); } + // Option B (defense-in-depth): If the reconciler re-dispatches an + // open triage batch bead (gt:triage, created_by='patrol') — e.g. + // because Option A's in_progress transition was somehow bypassed — + // inject the triage system prompt so the polecat gets the correct + // tools and instructions instead of the generic polecat prompt. + if (bead.labels.includes(patrol.TRIAGE_BATCH_LABEL) && bead.created_by === 'patrol') { + const pendingRequests = patrol.listPendingTriageRequests(this.sql); + const { buildTriageSystemPrompt } = await import('../prompts/triage-system.prompt'); + systemPromptOverride = buildTriageSystemPrompt(pendingRequests); + } + return scheduling.dispatchAgent(schedulingCtx, agent, bead, { systemPromptOverride, }); @@ -4055,6 +4066,9 @@ export class TownDO extends DurableObject { const systemPrompt = buildTriageSystemPrompt(pendingRequests); // Only now create the synthetic bead — preconditions are verified. + // Set rig_id so that if Rule 3 resets this bead to 'open' after a + // dispatch timeout, Rule 1 of the reconciler can pick it up and + // re-dispatch it (with the correct triage system prompt via Option B). const triageBead = beadOps.createBead(this.sql, { type: 'issue', title: `Triage batch: ${pendingCount} request(s)`, @@ -4062,11 +4076,20 @@ export class TownDO extends DurableObject { priority: 'high', labels: [patrol.TRIAGE_BATCH_LABEL], created_by: 'patrol', + rig_id: rigId, }); const triageAgent = agents.getOrCreateAgent(this.sql, 'polecat', rigId, this.townId); agents.hookBead(this.sql, triageAgent.id, triageBead.bead_id); + // Option A: Immediately mark the triage batch bead as in_progress so + // the reconciler's Rule 2 (idle agent + open hooked bead → dispatch_agent) + // does not re-fire on the next tick if the container start fails. Rule 3 + // (stale in_progress bead + no working agent + 5-min timeout) will reset + // it back to open if the dispatch fails, allowing a clean retry via + // maybeDispatchTriageAgent with the correct triage system prompt. + beadOps.updateBeadStatus(this.sql, triageBead.bead_id, 'in_progress', triageAgent.id); + const started = await dispatch.startAgentInContainer(this.env, this.ctx.storage, { townId: this.townId, rigId, From 10f80654e3f456bce8c0384fd24aa84d32728623 Mon Sep 17 00:00:00 2001 From: Breno Colom Date: Mon, 13 Apr 2026 16:56:26 +0200 Subject: [PATCH 05/11] feat(gastown): add cmake and pkg-config to container images (#2060) Add remaining build-essentials packages (cmake, pkg-config) to both prod and dev Dockerfiles. build-essential and libssl-dev were already present. --- services/gastown/container/Dockerfile | 4 +++- services/gastown/container/Dockerfile.dev | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/services/gastown/container/Dockerfile b/services/gastown/container/Dockerfile index b835a3332..225be7ee2 100644 --- a/services/gastown/container/Dockerfile +++ b/services/gastown/container/Dockerfile @@ -4,7 +4,7 @@ FROM oven/bun:1-slim # Package categories: # version control: git, git-lfs # network/download: curl, wget, ca-certificates, gnupg, unzip -# build toolchain: build-essential, autoconf +# build toolchain: build-essential, autoconf, cmake, pkg-config # search tools: ripgrep, jq # compression: bzip2, zstd # SSL/crypto: libssl-dev, libffi-dev @@ -27,6 +27,8 @@ RUN apt-get update && \ unzip \ build-essential \ autoconf \ + cmake \ + pkg-config \ ripgrep \ jq \ bzip2 \ diff --git a/services/gastown/container/Dockerfile.dev b/services/gastown/container/Dockerfile.dev index a4bebc5db..b8680462b 100644 --- a/services/gastown/container/Dockerfile.dev +++ b/services/gastown/container/Dockerfile.dev @@ -4,7 +4,7 @@ FROM --platform=linux/arm64 oven/bun:1-slim # Package categories: # version control: git, git-lfs # network/download: curl, wget, ca-certificates, gnupg, unzip -# build toolchain: build-essential, autoconf +# build toolchain: build-essential, autoconf, cmake, pkg-config # search tools: ripgrep, jq # compression: bzip2, zstd # SSL/crypto: libssl-dev, libffi-dev @@ -27,6 +27,8 @@ RUN apt-get update && \ unzip \ build-essential \ autoconf \ + cmake \ + pkg-config \ ripgrep \ jq \ bzip2 \ From afef7d50c3329005fa5b6268104249a5e65a97fc Mon Sep 17 00:00:00 2001 From: Breno Colom Date: Mon, 13 Apr 2026 16:56:37 +0200 Subject: [PATCH 06/11] feat(gastown): add Java JDK to container images (#2066) Install default-jdk (OpenJDK) in both prod and dev Dockerfiles to support Java project builds and runtime. --- services/gastown/container/Dockerfile | 2 ++ services/gastown/container/Dockerfile.dev | 2 ++ 2 files changed, 4 insertions(+) diff --git a/services/gastown/container/Dockerfile b/services/gastown/container/Dockerfile index 225be7ee2..44afc48b6 100644 --- a/services/gastown/container/Dockerfile +++ b/services/gastown/container/Dockerfile @@ -16,6 +16,7 @@ FROM oven/bun:1-slim # C++ stdlib: libc++1 # math: libgmp-dev # timezone data: tzdata +# Java: default-jdk RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ @@ -49,6 +50,7 @@ RUN apt-get update && \ libc++1 \ libgmp-dev \ tzdata \ + default-jdk \ && curl -fsSL https://deb.nodesource.com/setup_24.x | bash - \ && apt-get install -y --no-install-recommends nodejs \ && curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \ diff --git a/services/gastown/container/Dockerfile.dev b/services/gastown/container/Dockerfile.dev index b8680462b..5e5f00ace 100644 --- a/services/gastown/container/Dockerfile.dev +++ b/services/gastown/container/Dockerfile.dev @@ -16,6 +16,7 @@ FROM --platform=linux/arm64 oven/bun:1-slim # C++ stdlib: libc++1 # math: libgmp-dev # timezone data: tzdata +# Java: default-jdk RUN apt-get update && \ apt-get install -y --no-install-recommends \ git \ @@ -49,6 +50,7 @@ RUN apt-get update && \ libc++1 \ libgmp-dev \ tzdata \ + default-jdk \ && curl -fsSL https://deb.nodesource.com/setup_24.x | bash - \ && apt-get install -y --no-install-recommends nodejs \ && curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg \ From f1525c9c5ad192dcd07bc7ca9de3b99e3b488d7d Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 13 Apr 2026 10:22:15 -0500 Subject: [PATCH 07/11] fix(gastown): propagate custom env_vars to running containers on settings save (#2366) * fix(gastown): propagate custom env_vars to running containers on settings save Three gaps fixed: 1. syncTownConfigToProcessEnv() now applies custom env_vars from town config to process.env, with tracking of previously-applied keys so removed vars are deleted from process.env. 2. syncConfigToContainer() now persists custom env_vars to TownContainerDO storage (via container.setEnvVar/deleteEnvVar) so they survive container restarts. Previously-persisted custom keys are tracked in DO storage and cleaned up on removal. 3. updateAgentModel() hot-swap now overlays fresh custom env_vars from getCurrentTownConfig() over the stale startupEnv snapshot. Infra keys in LIVE_ENV_KEYS always take precedence. * fix(gastown): guard custom env_vars against reserved key override - control-server: export getLastAppliedEnvVarKeys() for process-manager - process-manager: delete stale custom keys from hotSwapEnv on hot-swap - Town.do: skip RESERVED_ENV_KEYS when setting custom env_vars on container Addresses 3 review warnings about custom env_vars overriding infra keys. * fix: skip reserved env keys in prevCustomKeys cleanup loop prevCustomKeys may contain reserved keys persisted by the previous implementation (before the RESERVED_ENV_KEYS filter was added). Without this guard the cleanup loop would delete managed infra values like KILOCODE_TOKEN that were just written by envMapping. --------- Co-authored-by: John Fawcett --- .../gastown/container/src/control-server.ts | 49 +++++++++++++++++++ .../gastown/container/src/process-manager.ts | 28 +++++++++++ services/gastown/src/dos/Town.do.ts | 47 ++++++++++++++++++ 3 files changed, 124 insertions(+) diff --git a/services/gastown/container/src/control-server.ts b/services/gastown/container/src/control-server.ts index b56eb97e6..9aef9defb 100644 --- a/services/gastown/container/src/control-server.ts +++ b/services/gastown/container/src/control-server.ts @@ -43,11 +43,39 @@ const TownConfigHeader = z.record(z.string(), z.unknown()); // Used as a fallback by code that runs outside a request context (e.g. background tasks). let lastKnownTownConfig: Record | null = null; +// Track which custom env var keys were applied last sync so removed keys can be cleared. +let lastAppliedEnvVarKeys = new Set(); + +// Env keys managed by the control plane that custom env_vars must never override. +// If a custom key collides with a reserved key, the infra value wins and the +// custom value is silently ignored — matching the !(key in env) guard in buildAgentEnv. +const RESERVED_ENV_KEYS = new Set([ + 'KILOCODE_TOKEN', + 'GIT_TOKEN', + 'GITHUB_TOKEN', + 'GITLAB_TOKEN', + 'GITLAB_INSTANCE_URL', + 'GITHUB_CLI_PAT', + 'GH_TOKEN', + 'GASTOWN_GIT_AUTHOR_NAME', + 'GASTOWN_GIT_AUTHOR_EMAIL', + 'GASTOWN_DISABLE_AI_COAUTHOR', + 'GASTOWN_ORGANIZATION_ID', + 'GASTOWN_CONTAINER_TOKEN', + 'GASTOWN_SESSION_TOKEN', + 'GASTOWN_API_URL', +]); + /** Get the latest town config delivered via X-Town-Config header. */ export function getCurrentTownConfig(): Record | null { return lastKnownTownConfig; } +/** Get the set of custom env var keys applied in the last sync. */ +export function getLastAppliedEnvVarKeys(): Set { + return lastAppliedEnvVarKeys; +} + /** * Sync config-derived env vars from the last-known town config into * process.env. Safe to call at any time — no-ops when no config is cached. @@ -102,6 +130,27 @@ function syncTownConfigToProcessEnv(): void { } else { delete process.env.GASTOWN_ORGANIZATION_ID; } + + // Apply custom env_vars from the town config. Reserved infra keys are + // skipped so the control-plane values always take precedence — matching the + // !(key in env) guard in buildAgentEnv. + const rawEnvVars = cfg.env_vars; + const customEnvVars: Record = + rawEnvVars !== null && typeof rawEnvVars === 'object' && !Array.isArray(rawEnvVars) + ? (rawEnvVars as Record) + : {}; + const newCustomKeys = new Set(Object.keys(customEnvVars)); + // Remove keys that were present in the previous sync but are gone now. + // Skip reserved keys — deleting those would wipe a control-plane value. + for (const key of lastAppliedEnvVarKeys) { + if (!newCustomKeys.has(key) && !RESERVED_ENV_KEYS.has(key)) delete process.env[key]; + } + // Apply current custom env vars, skipping reserved keys. + for (const [key, value] of Object.entries(customEnvVars)) { + if (RESERVED_ENV_KEYS.has(key)) continue; + process.env[key] = String(value); + } + lastAppliedEnvVarKeys = newCustomKeys; } export const app = new Hono(); diff --git a/services/gastown/container/src/process-manager.ts b/services/gastown/container/src/process-manager.ts index 77aa4bb41..659a2c2de 100644 --- a/services/gastown/container/src/process-manager.ts +++ b/services/gastown/container/src/process-manager.ts @@ -12,6 +12,7 @@ import * as fs from 'node:fs/promises'; import type { ManagedAgent, StartAgentRequest } from './types'; import { reportAgentCompleted, reportMayorWaiting } from './completion-reporter'; import { buildKiloConfigContent } from './agent-runner'; +import { getCurrentTownConfig, getLastAppliedEnvVarKeys } from './control-server'; import { log } from './logger'; const MANAGER_LOG = '[process-manager]'; @@ -1264,6 +1265,33 @@ export async function updateAgentModel( if (live) hotSwapEnv[key] = live; } + // Overlay custom env_vars from the town config so hot-swap picks up + // values that were added/changed after the initial dispatch. Infra + // keys in LIVE_ENV_KEYS always take precedence (they were already + // populated from process.env above), so custom vars cannot override. + const freshConfig = getCurrentTownConfig(); + const freshEnvVars = freshConfig?.env_vars; + const freshCustomKeySet = new Set(); + if (freshEnvVars !== null && typeof freshEnvVars === 'object' && !Array.isArray(freshEnvVars)) { + for (const [key, value] of Object.entries(freshEnvVars as Record)) { + if (LIVE_ENV_KEYS.has(key)) continue; + freshCustomKeySet.add(key); + if (value !== undefined && value !== null) { + hotSwapEnv[key] = String(value); + } else { + delete hotSwapEnv[key]; + } + } + } + // Remove stale custom env vars — keys that were applied in a previous + // sync but are no longer in the town config. Without this, startupEnv + // keeps carrying deleted custom keys through every hot-swap. + for (const key of getLastAppliedEnvVarKeys()) { + if (!freshCustomKeySet.has(key) && !LIVE_ENV_KEYS.has(key)) { + delete hotSwapEnv[key]; + } + } + // Re-derive GH_TOKEN from live values using the same priority chain // as buildAgentEnv: GITHUB_CLI_PAT > GIT_TOKEN > GITHUB_TOKEN. // syncConfigToContainer updates these on process.env, but buildAgentEnv diff --git a/services/gastown/src/dos/Town.do.ts b/services/gastown/src/dos/Town.do.ts index 97398f20e..695c87ea7 100644 --- a/services/gastown/src/dos/Town.do.ts +++ b/services/gastown/src/dos/Town.do.ts @@ -797,6 +797,53 @@ export class TownDO extends DurableObject { } } + // Persist custom env_vars to DO storage so they survive container restarts. + // Compare against the previously-persisted set of keys to clear removed ones. + // Reserved infra keys are never overwritten or deleted — infra values always win. + const RESERVED_ENV_KEYS = new Set([ + 'KILOCODE_TOKEN', + 'GIT_TOKEN', + 'GITHUB_TOKEN', + 'GITLAB_TOKEN', + 'GITLAB_INSTANCE_URL', + 'GITHUB_CLI_PAT', + 'GH_TOKEN', + 'GASTOWN_GIT_AUTHOR_NAME', + 'GASTOWN_GIT_AUTHOR_EMAIL', + 'GASTOWN_DISABLE_AI_COAUTHOR', + 'GASTOWN_ORGANIZATION_ID', + 'GASTOWN_CONTAINER_TOKEN', + 'GASTOWN_SESSION_TOKEN', + 'GASTOWN_API_URL', + ]); + const CUSTOM_ENV_KEYS_STORAGE_KEY = 'container:custom_env_var_keys'; + const prevCustomKeys: string[] = + (await this.ctx.storage.get(CUSTOM_ENV_KEYS_STORAGE_KEY)) ?? []; + const newCustomKeys = Object.keys(townConfig.env_vars).filter( + key => !RESERVED_ENV_KEYS.has(key) + ); + const newCustomKeySet = new Set(newCustomKeys); + + for (const key of prevCustomKeys) { + if (RESERVED_ENV_KEYS.has(key)) continue; + if (!newCustomKeySet.has(key)) { + try { + await container.deleteEnvVar(key); + } catch (err) { + console.warn(`[Town.do] syncConfigToContainer: delete custom ${key} failed:`, err); + } + } + } + for (const [key, value] of Object.entries(townConfig.env_vars)) { + if (RESERVED_ENV_KEYS.has(key)) continue; + try { + await container.setEnvVar(key, value); + } catch (err) { + console.warn(`[Town.do] syncConfigToContainer: set custom ${key} failed:`, err); + } + } + await this.ctx.storage.put(CUSTOM_ENV_KEYS_STORAGE_KEY, newCustomKeys); + // Phase 2: Push to the running container's process.env via the // /sync-config endpoint. The X-Town-Config header delivers the // full config; the endpoint applies CONFIG_ENV_MAP to process.env. From 97838eed8da100a88173be511ebc78b644473b00 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 13 Apr 2026 10:45:01 -0500 Subject: [PATCH 08/11] chore(gastown): remove dead code from patrol/scheduling/review-queue (#1403) (#2339) chore(gastown): remove dead GUPP_WARN_MS export and update stale patrol/queue comments - Remove unused GUPP_WARN_MS constant export from patrol.ts (never referenced outside the file) - Update completion-reporter.ts JSDoc: replace stale witnessPatrol/schedulePendingWork references with reconciler-based description - Update control-server.ts comments: replace stale processReviewQueue/recoverStuckReviews references with current TownDO terminology Part of issue #1403 dead code cleanup. Co-authored-by: John Fawcett --- services/gastown/container/src/completion-reporter.ts | 7 +++---- services/gastown/container/src/control-server.ts | 8 ++++---- services/gastown/src/dos/town/patrol.ts | 2 -- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/services/gastown/container/src/completion-reporter.ts b/services/gastown/container/src/completion-reporter.ts index 7a7766e61..04ece97b1 100644 --- a/services/gastown/container/src/completion-reporter.ts +++ b/services/gastown/container/src/completion-reporter.ts @@ -1,8 +1,7 @@ /** - * Reports agent completion/failure back to the Rig DO via the Gastown - * worker API. This closes the bead and unhooks the agent, preventing - * the infinite retry loop where witnessPatrol resets the agent to idle - * and schedulePendingWork re-dispatches it. + * Reports agent completion/failure back to the Gastown worker API. + * This closes the bead and unhooks the agent so the reconciler does not + * re-dispatch it. */ import type { ManagedAgent } from './types'; diff --git a/services/gastown/container/src/control-server.ts b/services/gastown/container/src/control-server.ts index 9aef9defb..de43a9546 100644 --- a/services/gastown/container/src/control-server.ts +++ b/services/gastown/container/src/control-server.ts @@ -520,9 +520,9 @@ app.post('/repos/setup', async c => { // POST /git/merge // Deterministic merge of a polecat branch into the target branch. -// Called by the Rig DO's processReviewQueue → startMergeInContainer. -// Runs the merge synchronously and reports the result back to the Rig DO -// via a callback to the completeReview endpoint. +// Called by the TownDO's startMergeInContainer. +// Runs the merge synchronously and reports the result back via a callback +// to the completeReview endpoint. app.post('/git/merge', async c => { const body: unknown = await c.req.json().catch(() => null); const parsed = MergeRequest.safeParse(body); @@ -588,7 +588,7 @@ app.post('/git/merge', async c => { } }; - // Fire and forget — the Rig DO will time out stuck entries via recoverStuckReviews + // Fire and forget — the TownDO will time out stuck entries via its alarm loop doMerge().catch(err => { console.error(`Merge failed for entry ${req.entryId}:`, err); }); diff --git a/services/gastown/src/dos/town/patrol.ts b/services/gastown/src/dos/town/patrol.ts index b1617599a..217262323 100644 --- a/services/gastown/src/dos/town/patrol.ts +++ b/services/gastown/src/dos/town/patrol.ts @@ -17,8 +17,6 @@ const LOG = '[patrol]'; // ── Thresholds ────────────────────────────────────────────────────── -/** First GUPP warning (existing behavior) */ -export const GUPP_WARN_MS = 30 * 60_000; // 30 min /** Escalate to mayor after second threshold */ export const GUPP_ESCALATE_MS = 60 * 60_000; // 1h /** Force-stop agent after third threshold */ From 0a8188bf90be3c932a8428eea3f1888f6b16dab2 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Mon, 13 Apr 2026 11:11:02 -0500 Subject: [PATCH 09/11] fix(gastown): break create_landing_mr infinite loop (#2260) (#2371) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(gastown): break create_landing_mr infinite loop (#2260) Add circuit breaker for landing MR creation to prevent runaway retry loops when convoys have no PR URLs. A town accumulated 5,335 failed actions over 41 hours before this fix. - Fix 1: Deduplicate MR bead creation — skip if an open/in_progress landing MR already exists for the convoy - Fix 2: Max 5 landing MR attempts with exponential cooldown (30s base, 30min cap), fail the convoy when exhausted - Fix 3: PR URL validation guard — skip landing MR creation when no tracked beads have a pr_url - Fix 4: Move convoy fail check before update_convoy_progress to prevent the race where progress updates are emitted for convoys about to be failed/closed Store landing_mr_attempts and last_landing_mr_attempt_at in the convoy bead's metadata JSON field (no schema migration needed). Add FailConvoy action type for explicit convoy failure. * fix(gastown): move max-attempts check after landing MR status lookup The max landing MR attempts guard was firing before checking whether the final landing MR was still active or already merged, making the last allowed attempt impossible to succeed. Now we check landing MR status first and only fail the convoy when no landing MR is active or merged. --------- Co-authored-by: John Fawcett --- services/gastown/src/dos/town/actions.ts | 45 +++- services/gastown/src/dos/town/reconciler.ts | 223 +++++++++++++------- 2 files changed, 189 insertions(+), 79 deletions(-) diff --git a/services/gastown/src/dos/town/actions.ts b/services/gastown/src/dos/town/actions.ts index d7fdfee53..bb1482d03 100644 --- a/services/gastown/src/dos/town/actions.ts +++ b/services/gastown/src/dos/town/actions.ts @@ -132,6 +132,12 @@ const CloseConvoy = z.object({ convoy_id: z.string(), }); +const FailConvoy = z.object({ + type: z.literal('fail_convoy'), + convoy_id: z.string(), + reason: z.string(), +}); + // ── Side effects (deferred) ───────────────────────────────────────── const DispatchAgent = z.object({ @@ -206,6 +212,7 @@ export const Action = z.discriminatedUnion('type', [ UpdateConvoyProgress, SetConvoyReadyToLand, CloseConvoy, + FailConvoy, // Side effects DispatchAgent, StopAgent, @@ -239,6 +246,7 @@ export type DeleteAgent = z.infer; export type UpdateConvoyProgress = z.infer; export type SetConvoyReadyToLand = z.infer; export type CloseConvoy = z.infer; +export type FailConvoy = z.infer; export type DispatchAgent = z.infer; export type StopAgent = z.infer; export type PollPr = z.infer; @@ -397,7 +405,22 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro } case 'create_landing_mr': { - // Create an MR bead for the landing merge (feature branch → main) + const timestamp = now(); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.landing_mr_attempts', + COALESCE(json_extract(${beads.columns.metadata}, '$.landing_mr_attempts'), 0) + 1, + '$.last_landing_mr_attempt_at', ? + ), + ${beads.columns.updated_at} = ? + WHERE ${beads.bead_id} = ? + `, + [timestamp, timestamp, action.convoy_id] + ); reviewQueue.submitToReviewQueue(sql, { agent_id: 'system', bead_id: action.convoy_id, @@ -592,7 +615,6 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro } case 'close_convoy': { - // Use updateBeadStatus for terminal state guard + bead event logging beadOps.updateBeadStatus(sql, action.convoy_id, 'closed', 'system'); query( sql, @@ -606,6 +628,25 @@ export function applyAction(ctx: ApplyActionContext, action: Action): (() => Pro return null; } + case 'fail_convoy': { + beadOps.updateBeadStatus(sql, action.convoy_id, 'failed', 'system'); + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.metadata} = json_set( + COALESCE(${beads.columns.metadata}, '{}'), + '$.failureReason', 'landing_mr_exhausted', + '$.failureMessage', ? + ), + ${beads.columns.updated_at} = ? + WHERE ${beads.bead_id} = ? + `, + [action.reason, now(), action.convoy_id] + ); + return null; + } + // ── Side effects (deferred) ───────────────────────────────── case 'dispatch_agent': { diff --git a/services/gastown/src/dos/town/reconciler.ts b/services/gastown/src/dos/town/reconciler.ts index 111473fd4..91884509e 100644 --- a/services/gastown/src/dos/town/reconciler.ts +++ b/services/gastown/src/dos/town/reconciler.ts @@ -45,6 +45,15 @@ const CIRCUIT_BREAKER_FAILURE_THRESHOLD = 20; /** Window in minutes for counting dispatch failures. */ const CIRCUIT_BREAKER_WINDOW_MINUTES = 30; +/** Max landing MR creation attempts before failing the convoy (#2260). */ +const MAX_LANDING_MR_ATTEMPTS = 5; + +/** Base cooldown for landing MR retry: min(2^attempts * BASE, MAX) (#2260). */ +const LANDING_MR_COOLDOWN_BASE_MS = 30_000; // 30s + +/** Max cooldown for landing MR retry (#2260). */ +const LANDING_MR_COOLDOWN_MAX_MS = 30 * 60_000; // 30 min + /** * Town-level dispatch circuit breaker. Counts beads with at least one * dispatch attempt in the recent window that have not yet closed @@ -1723,14 +1732,19 @@ export function reconcileConvoys(sql: SqlStorage): Action[] { if (progressRows.length === 0) continue; const { closed_count, total_count } = progressRows[0]; - // Update progress if stale - if (closed_count !== convoy.closed_beads) { - actions.push({ - type: 'update_convoy_progress', - convoy_id: convoy.bead_id, - closed_beads: closed_count, - }); + // Parse convoy metadata for landing MR tracking fields (#2260) + let parsedMeta: Record = {}; + try { + parsedMeta = JSON.parse(convoy.metadata) as Record; + } catch { + /* ignore */ } + const landingMrAttempts = + typeof parsedMeta.landing_mr_attempts === 'number' ? parsedMeta.landing_mr_attempts : 0; + const lastLandingMrAttemptAt = + typeof parsedMeta.last_landing_mr_attempt_at === 'string' + ? parsedMeta.last_landing_mr_attempt_at + : null; // Check for in-flight MR beads (open or in_progress) for tracked issue beads const inFlightMrCount = z @@ -1759,31 +1773,36 @@ export function reconcileConvoys(sql: SqlStorage): Action[] { const hasInFlightReviews = (inFlightMrCount[0]?.cnt ?? 0) > 0; // Check if all beads done - if (closed_count >= total_count && total_count > 0 && !hasInFlightReviews) { - let parsedMeta: Record = {}; - try { - parsedMeta = JSON.parse(convoy.metadata) as Record; - } catch { - /* ignore */ - } + const allBeadsDone = closed_count >= total_count && total_count > 0 && !hasInFlightReviews; - if (convoy.merge_mode === 'review-then-land' && convoy.feature_branch) { - if (!parsedMeta.ready_to_land) { - actions.push({ - type: 'set_convoy_ready_to_land', - convoy_id: convoy.bead_id, - }); - } + // Update progress if stale (skip if we're failing/closing the convoy this tick) + if (closed_count !== convoy.closed_beads) { + actions.push({ + type: 'update_convoy_progress', + convoy_id: convoy.bead_id, + closed_beads: closed_count, + }); + } - if (parsedMeta.ready_to_land) { - // Check if a landing MR already exists (any status) - const landingMrs = z - .object({ status: z.string() }) - .array() - .parse([ - ...query( - sql, - /* sql */ ` + if (!allBeadsDone) continue; + + if (convoy.merge_mode === 'review-then-land' && convoy.feature_branch) { + if (!parsedMeta.ready_to_land) { + actions.push({ + type: 'set_convoy_ready_to_land', + convoy_id: convoy.bead_id, + }); + } + + if (parsedMeta.ready_to_land) { + // Check if a landing MR already exists (any status) + const landingMrs = z + .object({ status: z.string() }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` SELECT mr.${beads.columns.status} FROM ${bead_dependencies} bd INNER JOIN ${beads} mr ON mr.${beads.columns.bead_id} = bd.${bead_dependencies.columns.bead_id} @@ -1791,36 +1810,87 @@ export function reconcileConvoys(sql: SqlStorage): Action[] { AND bd.${bead_dependencies.columns.dependency_type} = 'tracks' AND mr.${beads.columns.type} = 'merge_request' `, - [convoy.bead_id] - ), - ]); + [convoy.bead_id] + ), + ]); - // If a landing MR was already merged (closed), close the convoy - const hasMergedLanding = landingMrs.some(mr => mr.status === 'closed'); - if (hasMergedLanding) { - actions.push({ - type: 'close_convoy', - convoy_id: convoy.bead_id, - }); - continue; - } + // If a landing MR was already merged (closed), close the convoy + const hasMergedLanding = landingMrs.some(mr => mr.status === 'closed'); + if (hasMergedLanding) { + actions.push({ + type: 'close_convoy', + convoy_id: convoy.bead_id, + }); + continue; + } + + // Fix 1 (#2260): If a landing MR is active (open or in_progress), wait — don't create another + const hasActiveLanding = landingMrs.some( + mr => mr.status === 'open' || mr.status === 'in_progress' + ); + if (hasActiveLanding) continue; + + // Fix 2 (#2260): If max landing MR attempts exceeded and no landing MR is + // active or merged, fail the convoy. Checked after landing MR status lookup + // so the final allowed attempt can still succeed. + if (landingMrAttempts >= MAX_LANDING_MR_ATTEMPTS) { + actions.push({ + type: 'fail_convoy', + convoy_id: convoy.bead_id, + reason: `Landing MR creation failed after ${MAX_LANDING_MR_ATTEMPTS} attempts`, + }); + continue; + } + + // Fix 2 (#2260): Apply exponential cooldown between landing MR attempts + if (landingMrAttempts > 0 && lastLandingMrAttemptAt) { + const elapsed = Date.now() - new Date(lastLandingMrAttemptAt).getTime(); + const cooldownMs = Math.min( + Math.pow(2, landingMrAttempts) * LANDING_MR_COOLDOWN_BASE_MS, + LANDING_MR_COOLDOWN_MAX_MS + ); + if (elapsed < cooldownMs) continue; + } + + // Fix 3 (#2260): Check that tracked beads have at least one MR with a PR URL + const convoyBeadsWithPr = z + .object({ cnt: z.number() }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` + SELECT count(*) as cnt + FROM ${bead_dependencies} track_dep + INNER JOIN ${bead_dependencies} mr_dep + ON mr_dep.${bead_dependencies.columns.depends_on_bead_id} = track_dep.${bead_dependencies.columns.bead_id} + INNER JOIN ${review_metadata} rm + ON rm.${review_metadata.columns.bead_id} = mr_dep.${bead_dependencies.columns.bead_id} + WHERE track_dep.${bead_dependencies.columns.depends_on_bead_id} = ? + AND track_dep.${bead_dependencies.columns.dependency_type} = 'tracks' + AND mr_dep.${bead_dependencies.columns.dependency_type} = 'tracks' + AND rm.${review_metadata.columns.pr_url} IS NOT NULL + `, + [convoy.bead_id] + ), + ]); - // If a landing MR is active (open or in_progress), wait for it - const hasActiveLanding = landingMrs.some( - mr => mr.status === 'open' || mr.status === 'in_progress' + if ((convoyBeadsWithPr[0]?.cnt ?? 0) === 0) { + console.warn( + `${LOG} convoy ${convoy.bead_id} has no beads with pr_url — skipping create_landing_mr` ); - if (hasActiveLanding) continue; - - // No landing MR exists yet — create one - { - // Need rig_id from one of the tracked beads - const rigRows = z - .object({ rig_id: z.string() }) - .array() - .parse([ - ...query( - sql, - /* sql */ ` + continue; + } + + // No landing MR exists yet and cooldown has passed — create one + { + const rigRows = z + .object({ rig_id: z.string() }) + .array() + .parse([ + ...query( + sql, + /* sql */ ` SELECT DISTINCT tracked.${beads.columns.rig_id} as rig_id FROM ${bead_dependencies} bd INNER JOIN ${beads} tracked ON tracked.${beads.columns.bead_id} = bd.${bead_dependencies.columns.bead_id} @@ -1829,29 +1899,28 @@ export function reconcileConvoys(sql: SqlStorage): Action[] { AND tracked.${beads.columns.rig_id} IS NOT NULL LIMIT 1 `, - [convoy.bead_id] - ), - ]); - - if (rigRows.length > 0) { - const rig = getRig(sql, rigRows[0].rig_id); - actions.push({ - type: 'create_landing_mr', - convoy_id: convoy.bead_id, - rig_id: rigRows[0].rig_id, - feature_branch: convoy.feature_branch, - target_branch: rig?.default_branch ?? 'main', - }); - } + [convoy.bead_id] + ), + ]); + + if (rigRows.length > 0) { + const rig = getRig(sql, rigRows[0].rig_id); + actions.push({ + type: 'create_landing_mr', + convoy_id: convoy.bead_id, + rig_id: rigRows[0].rig_id, + feature_branch: convoy.feature_branch, + target_branch: rig?.default_branch ?? 'main', + }); } } - } else { - // review-and-merge or no feature branch — auto-close - actions.push({ - type: 'close_convoy', - convoy_id: convoy.bead_id, - }); } + } else { + // review-and-merge or no feature branch — auto-close + actions.push({ + type: 'close_convoy', + convoy_id: convoy.bead_id, + }); } } From 13e175fa9d93da8ecbea9f21b4253cb72401eb1c Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Tue, 14 Apr 2026 18:40:47 +0100 Subject: [PATCH 10/11] fix(gastown): prevent deleteAgent from reopening terminal beads; bump max_instances to 800 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit deleteAgent() ran a blanket UPDATE SET status='open' on all beads assigned to the deleted agent, bypassing the terminal-state guard in updateBeadStatus(). On town boot, reconcileGC() deletes stale agents, which silently reopened closed/failed beads — causing wasted re-processing and token spend. Split into two queries: terminal beads only clear their assignee, non-terminal beads are reopened for re-dispatch as before. Also bumps container max_instances 700 → 800 and updates the image ref. --- services/gastown/src/dos/town/agents.ts | 17 ++++++++++++++++- services/gastown/wrangler.jsonc | 4 ++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/services/gastown/src/dos/town/agents.ts b/services/gastown/src/dos/town/agents.ts index 48d963f20..8b6c2765d 100644 --- a/services/gastown/src/dos/town/agents.ts +++ b/services/gastown/src/dos/town/agents.ts @@ -202,7 +202,21 @@ export function updateAgentStatus(sql: SqlStorage, agentId: string, status: stri } export function deleteAgent(sql: SqlStorage, agentId: string): void { - // Unassign beads that reference this agent + // Clear assignee on terminal beads (closed/failed) without reopening them. + query( + sql, + /* sql */ ` + UPDATE ${beads} + SET ${beads.columns.assignee_agent_bead_id} = NULL, + ${beads.columns.updated_at} = ? + WHERE ${beads.assignee_agent_bead_id} = ? + AND ${beads.columns.status} IN ('closed', 'failed') + `, + [now(), agentId] + ); + + // Reopen non-terminal beads assigned to this agent so the reconciler + // can re-dispatch them. query( sql, /* sql */ ` @@ -211,6 +225,7 @@ export function deleteAgent(sql: SqlStorage, agentId: string): void { ${beads.columns.status} = 'open', ${beads.columns.updated_at} = ? WHERE ${beads.assignee_agent_bead_id} = ? + AND ${beads.columns.status} NOT IN ('closed', 'failed') `, [now(), agentId] ); diff --git a/services/gastown/wrangler.jsonc b/services/gastown/wrangler.jsonc index b3fc1fac2..496894cd1 100644 --- a/services/gastown/wrangler.jsonc +++ b/services/gastown/wrangler.jsonc @@ -35,9 +35,9 @@ "containers": [ { "class_name": "TownContainerDO", - "image": "./container/Dockerfile", + "image": "registry.cloudflare.com/e115e769bcdd4c3d66af59d3332cb394/gastown-towncontainerdo:197958b7", "instance_type": "standard-4", - "max_instances": 700, + "max_instances": 800, }, ], From 4f632a1db44222c5b6c434735de966685dd34a43 Mon Sep 17 00:00:00 2001 From: John Fawcett Date: Wed, 15 Apr 2026 13:32:16 +0100 Subject: [PATCH 11/11] chore(gastown): bump max_instances to 810 --- services/gastown/wrangler.jsonc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/gastown/wrangler.jsonc b/services/gastown/wrangler.jsonc index 496894cd1..649919cee 100644 --- a/services/gastown/wrangler.jsonc +++ b/services/gastown/wrangler.jsonc @@ -37,7 +37,7 @@ "class_name": "TownContainerDO", "image": "registry.cloudflare.com/e115e769bcdd4c3d66af59d3332cb394/gastown-towncontainerdo:197958b7", "instance_type": "standard-4", - "max_instances": 800, + "max_instances": 810, }, ],