From 9d1cdff2a3ee2f959c149a7c87321e681ede03c9 Mon Sep 17 00:00:00 2001
From: Manas Srivastava <mastermanas805@gmail.com>
Date: Thu, 11 Jun 2026 01:05:23 +0530
Subject: [PATCH 1/2] fix(resources): delete-by-id, failed provision rows,
 RPC-deadline floor (Wave-2 A1)

Three related durability fixes from the cross-interface sweep:

1. DELETE /api/v1/resources/:id accepts BOTH the row id (as returned by
   GET /api/v1/resources) and the provision token. Resolution tries token
   first (historical contract preserved byte-for-byte), then falls back to
   id (resolveResourceByTokenOrID). The natural list -> delete-by-id flow
   404'd 100% of the time before. Authorization is identical for both
   forms (team-scoped, 404-not-403). The deprovision RPC / cache key /
   IAM audit are now always addressed by resource.Token, never the raw
   path param. The env-policy middleware lookup
   (ResourceEnvByTokenOrIDForMiddleware) got the same fallback so the
   id-addressed form can't skip env_policy enforcement. openapi.go
   documents the dual-form param.

2. Provision rollback persists a status='failed' row instead of
   soft-deleting it, so a failed/timed-out provision is a pollable
   terminal state. models.MarkResourceFailed (atomic pending->failed,
   mirror of MarkResourceActive) replaces models.SoftDeleteResource at
   all 16 rollback sites (db/cache/nosql/queue/vector/storage x
   anon/auth/twin + finalizeProvision). Failed rows are visible in lists
   (only 'deleted' is hidden), deletable, quota-exempt
   (CountActiveResourcesByTeamAndType filters status='active'), and
   invisible to the reconciler ('pending') and TTL reaper
   (ReapableStatuses) - no live infra backs them. Migration 070 widens
   resources_status_check with 'failed'; 024/049/057 re-add the same
   canonical set (forward-consistency: the runner re-applies every
   migration on boot - 2026-06-10 incident class). SoftDeleteResource
   itself is removed (dead after this change).

3. Provision RPC deadlines can no longer undercut the provisioner's
   dedicated pod-ready wait. cacheProvisionTimeout was a 45s literal
   while the provisioner's k8s backends wait up to 3m for pod readiness
   (redisK8sReadyTO et al.) - a cold-pod Redis provision was killed
   mid-flight. New named constants dedicatedProvisionReadyWait (3m) +
   provisionRPCDeadlineMargin (30s) derive the deadline (3m30s);
   TestProvisionTimeouts_NeverUndercutDedicatedReadyWait enforces the
   floor for every tier and every provision deadline in the package.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 docs/openapi.yaml                             |  12 +-
 .../024_resources_paused_status.sql           |  11 +-
 .../049_resources_suspended_status.sql        |   7 +-
 .../057_resources_pending_status.sql          |   5 +-
 .../070_resources_failed_status.sql           |  51 ++++
 internal/db/postgres_migrations_test.go       |   2 +-
 internal/handlers/cache.go                    |   9 +-
 .../coverage_provisioner_grpc_test.go         |  14 +-
 internal/handlers/db.go                       |   6 +-
 internal/handlers/env_policy_helpers.go       |  21 +-
 internal/handlers/env_policy_test.go          |   2 +-
 internal/handlers/finalize_provision_test.go  |  14 +-
 internal/handlers/nosql.go                    |   9 +-
 internal/handlers/openapi.go                  |   5 +-
 internal/handlers/provision_helper.go         |  14 +-
 .../provision_softdelete_final2_test.go       |  29 +-
 internal/handlers/queue.go                    |   7 +-
 internal/handlers/resource.go                 |  36 ++-
 .../handlers/resource_delete_by_id_test.go    | 271 ++++++++++++++++++
 .../handlers/resource_env_mw_final3_test.go   |  15 +-
 .../resourcestatus_conversion_test.go         |   2 +-
 internal/handlers/storage.go                  |   4 +-
 internal/handlers/vector.go                   |   4 +-
 internal/models/coverage_resource_test.go     |  27 +-
 internal/models/resource.go                   |  49 +++-
 internal/provisioner/client.go                |  51 ++--
 internal/provisioner/client_cov_test.go       |  44 ++-
 internal/router/router.go                     |   2 +-
 openapi.snapshot.json                         |   4 +
 29 files changed, 603 insertions(+), 124 deletions(-)
 create mode 100644 internal/db/migrations/070_resources_failed_status.sql
 create mode 100644 internal/handlers/resource_delete_by_id_test.go

diff --git a/docs/openapi.yaml b/docs/openapi.yaml
index c215e126..70346445 100644
--- a/docs/openapi.yaml
+++ b/docs/openapi.yaml
@@ -538,7 +538,17 @@ paths:
       security:
         - BearerAuth: []
       parameters:
-        - $ref: "#/components/parameters/id"
+        - name: id
+          in: path
+          required: true
+          description: >-
+            Accepts EITHER the resource's `id` (as returned by
+            GET /api/v1/resources) OR its provision `token`. Resolution tries
+            token first, then id; authorization (team ownership) is identical
+            for both forms.
+          schema:
+            type: string
+            format: uuid
       responses:
         "200":
           description: Resource deleted.
diff --git a/internal/db/migrations/024_resources_paused_status.sql b/internal/db/migrations/024_resources_paused_status.sql
index fa1bfe97..921b3d1e 100644
--- a/internal/db/migrations/024_resources_paused_status.sql
+++ b/internal/db/migrations/024_resources_paused_status.sql
@@ -29,11 +29,12 @@ ALTER TABLE resources
     ADD CONSTRAINT resources_status_check
     -- Forward-consistent full status set (incident 2026-06-10). The migration
     -- runner RE-APPLIES every migration on each boot; a NARROW constraint here
-    -- (missing 'suspended' [added in 049] / 'pending' [added in 057]) crashes
-    -- the boot the moment a row already holds one of those later-added — but
-    -- valid — statuses. Re-adding the canonical set makes 024 safe to re-run
-    -- regardless of data. (024/049/057 now all define the same set.)
-    CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
+    -- (missing 'suspended' [added in 049] / 'pending' [added in 057] /
+    -- 'failed' [added in 070]) crashes the boot the moment a row already holds
+    -- one of those later-added — but valid — statuses. Re-adding the canonical
+    -- set makes 024 safe to re-run regardless of data. (024/049/057/070 now
+    -- all define the same set.)
+    CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));
 
 ALTER TABLE resources ADD COLUMN IF NOT EXISTS paused_at TIMESTAMPTZ;
 
diff --git a/internal/db/migrations/049_resources_suspended_status.sql b/internal/db/migrations/049_resources_suspended_status.sql
index 36f77755..d23e0f3a 100644
--- a/internal/db/migrations/049_resources_suspended_status.sql
+++ b/internal/db/migrations/049_resources_suspended_status.sql
@@ -31,9 +31,10 @@ ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
 ALTER TABLE resources
     ADD CONSTRAINT resources_status_check
     -- Forward-consistent full status set (incident 2026-06-10): include 'pending'
-    -- (added in 057) so re-applying 049 on boot can't crash on a valid pending
-    -- row before 057 runs. 024/049/057 now all define the same canonical set.
-    CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
+    -- (added in 057) and 'failed' (added in 070) so re-applying 049 on boot
+    -- can't crash on a valid later-added row before its own migration runs.
+    -- 024/049/057/070 now all define the same canonical set.
+    CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));
 
 -- Partial index for the auto-unsuspend scan.
 -- EnforceStorageQuotaWorker scans WHERE status = 'suspended' on every run to
diff --git a/internal/db/migrations/057_resources_pending_status.sql b/internal/db/migrations/057_resources_pending_status.sql
index 51d30016..d8a55ac9 100644
--- a/internal/db/migrations/057_resources_pending_status.sql
+++ b/internal/db/migrations/057_resources_pending_status.sql
@@ -42,7 +42,10 @@
 ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
 ALTER TABLE resources
     ADD CONSTRAINT resources_status_check
-    CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
+    -- Forward-consistent full status set (incident 2026-06-10): include 'failed'
+    -- (added in 070) so re-applying 057 on boot can't crash on a valid failed
+    -- row before 070 runs. 024/049/057/070 now all define the same canonical set.
+    CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));
 
 -- idx_resources_pending_sweep (the partial index the reconciler scans) was
 -- already created by migration 030_resource_heartbeat.sql — it indexes
diff --git a/internal/db/migrations/070_resources_failed_status.sql b/internal/db/migrations/070_resources_failed_status.sql
new file mode 100644
index 00000000..2d398b4e
--- /dev/null
+++ b/internal/db/migrations/070_resources_failed_status.sql
@@ -0,0 +1,51 @@
+-- Migration: 070_resources_failed_status
+--
+-- Add 'failed' as a permitted value in the resources.status CHECK constraint.
+--
+-- Background (Wave-2 A1, cross-interface durability sweep 2026-06-11):
+--   When the synchronous backend provision RPC fails or times out, every
+--   provision handler used to ROLL BACK the just-created 'pending' row by
+--   soft-deleting it (status='deleted'). From the caller's point of view the
+--   resource simply VANISHED: a timed-out /db/new (or /cache/new, /nosql/new,
+--   /queue/new, /vector/new, /storage/new, twin provision) returned 503 and
+--   any follow-up GET on the token 404'd because the public read surface
+--   hides deleted rows. Agents polling for an in-flight provision had no
+--   terminal state to observe.
+--
+--   The handlers now mark the row status='failed' instead
+--   (models.MarkResourceFailed, pending→failed). A failed row is:
+--     * visible in GET /api/v1/resources (lists filter only status='deleted')
+--     * deletable via DELETE /api/v1/resources/:id (the status!='deleted'
+--       guard in SoftDeleteResourceIfActive admits it)
+--     * NOT counted toward plan quotas (quota counts filter status='active')
+--     * never swept by the provisioner_reconciler (it keys on 'pending') or
+--       the TTL reaper (ReapableStatuses() = active/paused/suspended) — the
+--       backend object either never existed or was already torn down by the
+--       rollback's best-effort cleanup, so there is nothing to deprovision.
+--
+-- Without this migration every MarkResourceFailed UPDATE would hit
+-- constraint-violation 23514 and the rollback path itself would error.
+--
+-- Status semantics (updated):
+--   pending    — row inserted, backend provision RPC + URL persistence not yet
+--                complete; the transient mid-provision state. NOT usable.
+--   active     — provisioned, accepting connections (or status-only for queue/storage/webhook)
+--   paused     — user-initiated pause (Pro+ only); infra revoked; data preserved
+--   suspended  — system-initiated suspend on storage quota breach; infra revoked
+--   failed     — terminal: the backend provision RPC failed/timed out and the
+--                rollback kept the row as a pollable terminal state. No live
+--                backing infra. Visible in lists; deletable; quota-exempt.
+--   expired    — TTL reached (anonymous resources); soft-deleted equivalent for anon
+--   deleted    — user-deleted (permanent credentials removed)
+--   reaped     — legacy: worker-reaped before 'deleted' was the canonical term
+--
+-- Idempotent: DROP IF EXISTS + re-ADD with the same syntax, so re-running on a
+-- schema that already applied this migration is harmless. Migrations 024/049/057
+-- were updated in the same change to define this same canonical set (the
+-- migration runner RE-APPLIES every migration on each boot — see the
+-- forward-consistency note in 024, incident 2026-06-10).
+
+ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
+ALTER TABLE resources
+    ADD CONSTRAINT resources_status_check
+    CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));
diff --git a/internal/db/postgres_migrations_test.go b/internal/db/postgres_migrations_test.go
index cfb7e9f8..a2a1bb95 100644
--- a/internal/db/postgres_migrations_test.go
+++ b/internal/db/postgres_migrations_test.go
@@ -511,7 +511,7 @@ func TestConnectRedis_PanicsOnUnreachable(t *testing.T) {
 // re-applying it mid-sequence can never reject a valid row. No DB needed —
 // reads the embedded SQL via the same seam the runner uses.
 func TestMigrations_ResourcesStatusCheck_ForwardConsistent(t *testing.T) {
-	canonical := []string{"pending", "active", "paused", "suspended", "expired", "deleted", "reaped"}
+	canonical := []string{"pending", "active", "paused", "suspended", "failed", "expired", "deleted", "reaped"}
 	checked := 0
 	for _, name := range MigrationFiles() {
 		b, err := readMigrationFile(name)
diff --git a/internal/handlers/cache.go b/internal/handlers/cache.go
index 1248d184..3a403763 100644
--- a/internal/handlers/cache.go
+++ b/internal/handlers/cache.go
@@ -221,8 +221,9 @@ func (h *CacheHandler) NewCache(c *fiber.Ctx) error {
 		middleware.RecordProvisionFail("redis", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("cache.new.provision_failed",
 			"error", err, "token", tokenStr, "request_id", requestID)
-		// Soft-delete the resource record so limits aren't falsely consumed.
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		// Mark the pending row 'failed' — a pollable terminal state. Failed
+		// rows never count against quota (counts filter status='active').
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("cache.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision Redis namespace")
@@ -387,7 +388,7 @@ func (h *CacheHandler) newCacheAuthenticated(
 		middleware.RecordProvisionFail("redis", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("cache.new.provision_failed_auth",
 			"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("cache.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision Redis namespace")
@@ -563,7 +564,7 @@ func (h *CacheHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionFor
 		middleware.RecordProvisionFail(models.ResourceTypeRedis, middleware.ProvisionFailBackendUnavailable)
 		slog.Error("twin.cache.provision_failed",
 			"error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("twin.cache.soft_delete_failed",
 				"error", delErr, "resource_id", resource.ID, "request_id", in.RequestID)
 		}
diff --git a/internal/handlers/coverage_provisioner_grpc_test.go b/internal/handlers/coverage_provisioner_grpc_test.go
index 43062eb7..c157e260 100644
--- a/internal/handlers/coverage_provisioner_grpc_test.go
+++ b/internal/handlers/coverage_provisioner_grpc_test.go
@@ -67,7 +67,8 @@ type fakeProvisioner struct {
 	failProvision    bool
 	deprovisionCalls int
 
-	lastReq *provisionerv1.ProvisionRequest
+	lastReq            *provisionerv1.ProvisionRequest
+	lastDeprovisionReq *provisionerv1.DeprovisionRequest
 }
 
 func (f *fakeProvisioner) ProvisionResource(_ context.Context, req *provisionerv1.ProvisionRequest) (*provisionerv1.ProvisionResponse, error) {
@@ -113,13 +114,21 @@ func (f *fakeProvisioner) ProvisionResource(_ context.Context, req *provisionerv
 	}
 }
 
-func (f *fakeProvisioner) DeprovisionResource(_ context.Context, _ *provisionerv1.DeprovisionRequest) (*provisionerv1.DeprovisionResponse, error) {
+func (f *fakeProvisioner) DeprovisionResource(_ context.Context, req *provisionerv1.DeprovisionRequest) (*provisionerv1.DeprovisionResponse, error) {
 	f.mu.Lock()
 	f.deprovisionCalls++
+	f.lastDeprovisionReq = req
 	f.mu.Unlock()
 	return &provisionerv1.DeprovisionResponse{}, nil
 }
 
+// lastDeprovision returns the most recent DeprovisionRequest (nil if none).
+func (f *fakeProvisioner) lastDeprovision() *provisionerv1.DeprovisionRequest {
+	f.mu.Lock()
+	defer f.mu.Unlock()
+	return f.lastDeprovisionReq
+}
+
 func (f *fakeProvisioner) deprovisionCount() int {
 	f.mu.Lock()
 	defer f.mu.Unlock()
@@ -231,6 +240,7 @@ func setupGRPCProvFixture(t *testing.T, fake *fakeProvisioner, badAESKey bool) g
 
 	middleware.SetRoleLookupDB(db)
 	api := app.Group("/api/v1", middleware.RequireAuth(cfg), middleware.PopulateTeamRole())
+	api.Get("/resources", resourceH.List)
 	api.Get("/resources/:id", resourceH.Get)
 	api.Delete("/resources/:id", resourceH.Delete)
 	api.Post("/resources/:id/provision-twin", twinH.ProvisionTwin)
diff --git a/internal/handlers/db.go b/internal/handlers/db.go
index 583ef4cf..728ae77c 100644
--- a/internal/handlers/db.go
+++ b/internal/handlers/db.go
@@ -260,7 +260,7 @@ func (h *DBHandler) NewDB(c *fiber.Ctx) error {
 		middleware.RecordProvisionFail("postgres", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("db.new.provision_failed",
 			"error", err, "token", tokenStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("db.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision Postgres database")
@@ -438,7 +438,7 @@ func (h *DBHandler) newDBAuthenticated(
 		middleware.RecordProvisionFail("postgres", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("db.new.provision_failed_auth",
 			"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("db.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision Postgres database")
@@ -647,7 +647,7 @@ func (h *DBHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionForTwi
 		middleware.RecordProvisionFail(models.ResourceTypePostgres, middleware.ProvisionFailBackendUnavailable)
 		slog.Error("twin.db.provision_failed",
 			"error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("twin.db.soft_delete_failed",
 				"error", delErr, "resource_id", resource.ID, "request_id", in.RequestID)
 		}
diff --git a/internal/handlers/env_policy_helpers.go b/internal/handlers/env_policy_helpers.go
index 7d48549f..55329576 100644
--- a/internal/handlers/env_policy_helpers.go
+++ b/internal/handlers/env_policy_helpers.go
@@ -14,24 +14,27 @@ import (
 
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
-	"instant.dev/internal/models"
 )
 
-// ResourceEnvByTokenForMiddleware reads the env stored on a resource row
-// addressed by the URL :id param (a public token UUID). Returns the env on
-// success or "" on any error — the env-policy middleware fails OPEN on
-// lookup error so a malformed/non-existent :id falls through to the
-// handler's own 400/404 instead of a confusing 403/env_policy_denied.
+// ResourceEnvByTokenOrIDForMiddleware reads the env stored on a resource row
+// addressed by the URL :id param — resolved first as a public token UUID,
+// then as the row's primary-key id (resolveResourceByTokenOrID), matching
+// the DELETE handler's own resolution so the env-policy gate covers BOTH
+// address forms (without the id fallback an id-addressed DELETE would skip
+// env-policy enforcement entirely). Returns the env on success or "" on any
+// error — the env-policy middleware fails OPEN on lookup error so a
+// malformed/non-existent :id falls through to the handler's own 400/404
+// instead of a confusing 403/env_policy_denied.
 //
 // Exported with the verbose suffix so its single intended caller (the
 // router wiring) is unambiguous; this is not a general-purpose helper.
-func ResourceEnvByTokenForMiddleware(c *fiber.Ctx, db *sql.DB) (string, error) {
+func ResourceEnvByTokenOrIDForMiddleware(c *fiber.Ctx, db *sql.DB) (string, error) {
 	tokenStr := c.Params("id")
-	token, err := uuid.Parse(tokenStr)
+	pathUUID, err := uuid.Parse(tokenStr)
 	if err != nil {
 		return "", nil
 	}
-	r, err := models.GetResourceByToken(c.Context(), db, token)
+	r, err := resolveResourceByTokenOrID(c.Context(), db, pathUUID)
 	if err != nil {
 		// Including ErrResourceNotFound — fail open so the handler returns
 		// its own 404 (which contains a stable, agent-readable shape).
diff --git a/internal/handlers/env_policy_test.go b/internal/handlers/env_policy_test.go
index c789ecc8..0f130e20 100644
--- a/internal/handlers/env_policy_test.go
+++ b/internal/handlers/env_policy_test.go
@@ -73,7 +73,7 @@ func envPolicyApp(t *testing.T, db *sql.DB) *fiber.App {
 	api.Delete("/resources/:id",
 		middleware.RequireEnvAccess(middleware.EnvPolicyActionDeleteResource,
 			middleware.WithEnvLookup(func(c *fiber.Ctx) (string, error) {
-				return handlers.ResourceEnvByTokenForMiddleware(c, db)
+				return handlers.ResourceEnvByTokenOrIDForMiddleware(c, db)
 			}),
 		),
 		func(c *fiber.Ctx) error { return c.JSON(fiber.Map{"ok": true}) },
diff --git a/internal/handlers/finalize_provision_test.go b/internal/handlers/finalize_provision_test.go
index 7041bd61..28b95754 100644
--- a/internal/handlers/finalize_provision_test.go
+++ b/internal/handlers/finalize_provision_test.go
@@ -77,16 +77,18 @@ func TestFinalizeProvision_PersistenceFailure_ReturnsErrorAndRunsCleanup(t *test
 		"finalizeProvision must run the cleanup closure on persistence failure to tear down "+
 			"the just-provisioned backend object; otherwise the platform leaks an orphan")
 
-	// 3. Row is soft-deleted (status='deleted'), NOT left at 'pending' or
-	//    'active'. A pending row would be picked up by the reconciler; an
-	//    active row would falsely advertise itself as usable in dashboard
-	//    listings and quota counts.
+	// 3. Row is marked failed (status='failed'), NOT left at 'pending' or
+	//    'active', and NOT soft-deleted. A pending row would be picked up by
+	//    the reconciler; an active row would falsely advertise itself as
+	//    usable in dashboard listings and quota counts; a deleted row would
+	//    VANISH from the caller's read surface (the pre-Wave-2-A1 behaviour)
+	//    leaving no pollable terminal state.
 	var status string
 	require.NoError(t, dbConn.QueryRow(
 		`SELECT status FROM resources WHERE id = $1`, res.ID,
 	).Scan(&status))
-	assert.Equal(t, "deleted", status,
-		"on a persistence failure the row must be soft-deleted so it doesn't leak as an orphan")
+	assert.Equal(t, models.StatusFailed, status,
+		"on a persistence failure the row must be marked 'failed' — a pollable terminal state")
 }
 
 // TestFinalizeProvision_Success_FlipsToActive is the happy-path guard:
diff --git a/internal/handlers/nosql.go b/internal/handlers/nosql.go
index fb4bccd6..857ad4bd 100644
--- a/internal/handlers/nosql.go
+++ b/internal/handlers/nosql.go
@@ -217,8 +217,9 @@ func (h *NoSQLHandler) NewNoSQL(c *fiber.Ctx) error {
 		middleware.RecordProvisionFail("mongodb", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("nosql.new.provision_failed",
 			"error", err, "token", tokenStr, "request_id", requestID)
-		// Soft-delete the resource record so limits aren't falsely consumed.
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		// Mark the pending row 'failed' — a pollable terminal state. Failed
+		// rows never count against quota (counts filter status='active').
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("nosql.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision MongoDB database")
@@ -379,7 +380,7 @@ func (h *NoSQLHandler) newNoSQLAuthenticated(
 		middleware.RecordProvisionFail("mongodb", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("nosql.new.provision_failed_auth",
 			"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("nosql.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision MongoDB database")
@@ -567,7 +568,7 @@ func (h *NoSQLHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionFor
 		middleware.RecordProvisionFail(models.ResourceTypeMongoDB, middleware.ProvisionFailBackendUnavailable)
 		slog.Error("twin.nosql.provision_failed",
 			"error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("twin.nosql.soft_delete_failed",
 				"error", delErr, "resource_id", resource.ID, "request_id", in.RequestID)
 		}
diff --git a/internal/handlers/openapi.go b/internal/handlers/openapi.go
index 6e1fdd95..c182b729 100644
--- a/internal/handlers/openapi.go
+++ b/internal/handlers/openapi.go
@@ -882,10 +882,11 @@ const openAPISpec = `{
       "delete": {
         "summary": "Delete a resource",
         "security": [{ "bearerAuth": [] }],
-        "parameters": [{ "name": "id", "in": "path", "required": true, "schema": { "type": "string", "format": "uuid" } }],
+        "parameters": [{ "name": "id", "in": "path", "required": true, "schema": { "type": "string", "format": "uuid" }, "description": "Accepts EITHER the resource's 'id' (as returned by GET /api/v1/resources) OR its provision 'token'. Resolution tries token first, then id; authorization (team ownership) is identical for both forms." }],
         "responses": {
           "200": { "description": "Resource deleted" },
-          "403": { "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }." }
+          "403": { "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }." },
+          "404": { "description": "Not found — no resource with that id or token, or it belongs to another team." }
         }
       }
     },
diff --git a/internal/handlers/provision_helper.go b/internal/handlers/provision_helper.go
index acc8bc07..1697f29a 100644
--- a/internal/handlers/provision_helper.go
+++ b/internal/handlers/provision_helper.go
@@ -430,7 +430,9 @@ var errProvisionPersistFailed = errors.New("provision persistence failed")
 // this helper:
 //
 //   - runs the caller-supplied cleanup closure (best-effort backend deprovision),
-//   - soft-deletes the resource row,
+//   - marks the resource row 'failed' (a pollable terminal state — see
+//     models.MarkResourceFailed; failed rows are list-visible, deletable,
+//     and quota-exempt),
 //   - returns errProvisionPersistFailed.
 //
 // The caller treats a non-nil return as a hard provision failure
@@ -505,12 +507,14 @@ func (h *provisionHelper) finalizeProvision(
 	}
 
 	// Persistence failed — the resource is unreachable / un-addressable. Tear
-	// down the backend object (best-effort) and soft-delete the row so the
-	// platform is not left billing an orphan, then signal a hard failure.
+	// down the backend object (best-effort) and mark the row 'failed' so the
+	// caller has a pollable terminal state and the platform is not left
+	// billing an orphan (quota counts filter status='active'), then signal a
+	// hard failure.
 	if cleanup != nil {
 		cleanup()
 	}
-	if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+	if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 		slog.Error(logPrefix+".cleanup_soft_delete_failed", "error", delErr,
 			"resource_id", resource.ID, "request_id", requestID)
 	}
@@ -518,7 +522,7 @@ func (h *provisionHelper) finalizeProvision(
 	// MR-P0-3: emit the operator-alert audit row. This is the moment the
 	// platform produced an unreachable resource — the backend object existed
 	// (briefly), the platform DB could not address it, and the deprovision +
-	// soft-delete is the compensation. Operators key on this kind to
+	// 'failed' mark is the compensation. Operators key on this kind to
 	// reconstruct the exact request, find the upstream failure cause
 	// (DB unreachable / encrypt failure / etc.), and audit-trace any backend
 	// objects that escaped the best-effort cleanup. Best-effort emit: audit-
diff --git a/internal/handlers/provision_softdelete_final2_test.go b/internal/handlers/provision_softdelete_final2_test.go
index 79f6285f..cc5b27b3 100644
--- a/internal/handlers/provision_softdelete_final2_test.go
+++ b/internal/handlers/provision_softdelete_final2_test.go
@@ -19,6 +19,7 @@ package handlers_test
 // caps). A unique IP per call keeps the rate-limit/fingerprint counters clean.
 
 import (
+	"context"
 	"database/sql"
 	"encoding/json"
 	"errors"
@@ -37,6 +38,7 @@ import (
 	"instant.dev/internal/config"
 	"instant.dev/internal/handlers"
 	"instant.dev/internal/middleware"
+	"instant.dev/internal/models"
 	"instant.dev/internal/plans"
 	"instant.dev/internal/testhelpers"
 )
@@ -114,7 +116,9 @@ func postBadBackend(t *testing.T, app *fiber.App, path, jwt, ip string) (int, st
 }
 
 // TestProvisionFinal2_BackendFailure_SoftDelete_Auth covers the authenticated
-// backend-failure soft-delete arms in db.go (L424) and nosql.go.
+// backend-failure rollback arms in db.go (L424) and nosql.go. Post Wave-2 A1
+// the rollback persists the row as status='failed' (a pollable terminal
+// state) instead of soft-deleting it — asserted per resource type below.
 func TestProvisionFinal2_BackendFailure_SoftDelete_Auth(t *testing.T) {
 	if os.Getenv("TEST_DATABASE_URL") == "" {
 		t.Skip("TEST_DATABASE_URL not set")
@@ -123,10 +127,25 @@ func TestProvisionFinal2_BackendFailure_SoftDelete_Auth(t *testing.T) {
 	teamID := testhelpers.MustCreateTeamDB(t, liveDB, "pro")
 	jwt := authSessionJWT(t, liveDB, teamID)
 
-	for i, path := range []string{"/db/new", "/nosql/new", "/vector/new", "/queue/new"} {
-		status, errCode := postBadBackend(t, app, path, jwt, "10.220."+digitStr(i)+".7")
-		assert.Equalf(t, http.StatusServiceUnavailable, status, "%s backend-fail must 503", path)
-		assert.Equalf(t, "provision_failed", errCode, "%s error code", path)
+	cases := []struct{ path, resourceType string }{
+		{"/db/new", "postgres"},
+		{"/nosql/new", "mongodb"},
+		{"/vector/new", "vector"},
+		{"/queue/new", "queue"},
+	}
+	for i, tc := range cases {
+		status, errCode := postBadBackend(t, app, tc.path, jwt, "10.220."+digitStr(i)+".7")
+		assert.Equalf(t, http.StatusServiceUnavailable, status, "%s backend-fail must 503", tc.path)
+		assert.Equalf(t, "provision_failed", errCode, "%s error code", tc.path)
+
+		// Wave-2 A1: the rolled-back row must persist as 'failed', not vanish.
+		var rowStatus string
+		require.NoErrorf(t, liveDB.QueryRowContext(context.Background(),
+			`SELECT status FROM resources WHERE team_id = $1::uuid AND resource_type = $2
+			 ORDER BY created_at DESC LIMIT 1`, teamID, tc.resourceType).Scan(&rowStatus),
+			"%s rollback row must exist", tc.path)
+		assert.Equalf(t, models.StatusFailed, rowStatus,
+			"%s rollback row must be status='failed' (pollable terminal state)", tc.path)
 	}
 }
 
diff --git a/internal/handlers/queue.go b/internal/handlers/queue.go
index af4cd126..ca443f12 100644
--- a/internal/handlers/queue.go
+++ b/internal/handlers/queue.go
@@ -297,8 +297,9 @@ func (h *QueueHandler) NewQueue(c *fiber.Ctx) error {
 		middleware.RecordProvisionFail("queue", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("queue.new.provision_failed",
 			"error", err, "token", tokenStr, "request_id", requestID)
-		// Soft-delete the resource record so limits aren't falsely consumed.
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		// Mark the pending row 'failed' — a pollable terminal state. Failed
+		// rows never count against quota (counts filter status='active').
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("queue.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision NATS credentials")
@@ -523,7 +524,7 @@ func (h *QueueHandler) newQueueAuthenticated(
 		middleware.RecordProvisionFail("queue", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("queue.new.provision_failed_auth",
 			"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("queue.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision NATS credentials")
diff --git a/internal/handlers/resource.go b/internal/handlers/resource.go
index 88c58c1e..9cf9326b 100644
--- a/internal/handlers/resource.go
+++ b/internal/handlers/resource.go
@@ -181,7 +181,29 @@ func (h *ResourceHandler) Get(c *fiber.Ctx) error {
 	})
 }
 
+// resolveResourceByTokenOrID resolves a path UUID first as a provision token
+// (the historical DELETE contract), then — when no token matches — as the
+// row's primary-key id. GET /api/v1/resources keys its items by `id`, so
+// before this fallback the natural list→delete-by-id flow 404'd 100% of the
+// time (Wave-2 A1). Token resolution stays first so existing token-addressed
+// callers see byte-identical behaviour; a collision between one row's token
+// and another row's id is a 2^-122 non-event.
+func resolveResourceByTokenOrID(ctx context.Context, db *sql.DB, pathUUID uuid.UUID) (*models.Resource, error) {
+	resource, err := models.GetResourceByToken(ctx, db, pathUUID)
+	if err != nil {
+		var notFound *models.ErrResourceNotFound
+		if errors.As(err, &notFound) {
+			return models.GetResourceByID(ctx, db, pathUUID)
+		}
+	}
+	return resource, err
+}
+
 // Delete handles DELETE /api/v1/resources/:id — soft-deletes a resource.
+// :id accepts EITHER the resource's provision token (historical contract)
+// OR its `id` as returned by GET /api/v1/resources (see
+// resolveResourceByTokenOrID). Authorization is identical for both forms:
+// the row must belong to the caller's team.
 func (h *ResourceHandler) Delete(c *fiber.Ctx) error {
 	requestID := middleware.GetRequestID(c)
 
@@ -191,12 +213,12 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error {
 	}
 
 	tokenStr := c.Params("id")
-	token, parseErr := uuid.Parse(tokenStr)
+	pathUUID, parseErr := uuid.Parse(tokenStr)
 	if parseErr != nil {
 		return respondError(c, fiber.StatusBadRequest, "invalid_id", "Resource ID must be a valid UUID")
 	}
 
-	resource, err := models.GetResourceByToken(c.Context(), h.db, token)
+	resource, err := resolveResourceByTokenOrID(c.Context(), h.db, pathUUID)
 	if err != nil {
 		var notFound *models.ErrResourceNotFound
 		if errors.As(err, &notFound) {
@@ -210,6 +232,11 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error {
 		return respondError(c, fiber.StatusServiceUnavailable, "fetch_failed", "Failed to fetch resource")
 	}
 
+	// token is the resource's TOKEN, not the raw path UUID — under the
+	// id-addressed form the path param is the row id, and every backend
+	// object name / cache key / audit summary derives from the token.
+	token := resource.Token
+
 	if !resource.TeamID.Valid || resource.TeamID.UUID != teamID {
 		// 404 not 403: never confirm the existence of resources owned by
 		// other teams (or unclaimed anonymous resources).
@@ -298,8 +325,9 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error {
 
 	// Invalidate the cached resource entry so any in-flight requests
 	// see the deletion immediately rather than waiting for TTL expiry.
-	// Use token.String() (normalized lowercase UUID) to match the key written by
-	// getResourceCached — tokenStr is the raw URL param and may be mixed-case.
+	// Use token.String() (normalized lowercase TOKEN UUID) to match the key
+	// written by getResourceCached — tokenStr is the raw URL param, may be
+	// mixed-case, and under the id-addressed form isn't the token at all.
 	h.rdb.Del(c.Context(), fmt.Sprintf("res:%s", token.String()))
 
 	slog.Info("resource.deleted",
diff --git a/internal/handlers/resource_delete_by_id_test.go b/internal/handlers/resource_delete_by_id_test.go
new file mode 100644
index 00000000..7dd9666e
--- /dev/null
+++ b/internal/handlers/resource_delete_by_id_test.go
@@ -0,0 +1,271 @@
+package handlers_test
+
+// resource_delete_by_id_test.go — Wave-2 A1 behavioural coverage:
+//
+//  1. DELETE /api/v1/resources/:id accepts BOTH address forms — the row `id`
+//     (as returned by GET /api/v1/resources) and the provision `token`. The
+//     id form used to 404 100% of the time, breaking the natural
+//     list→delete-by-id flow.
+//  2. Provision-rollback rows persist as status='failed' (a pollable terminal
+//     state): visible in lists, deletable, never counted against quota.
+//
+// Uses the bufconn fake-provisioner fixture from
+// coverage_provisioner_grpc_test.go so the deprovision RPC is observable.
+
+import (
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"instant.dev/internal/handlers"
+	"instant.dev/internal/models"
+	"instant.dev/internal/plans"
+	"instant.dev/internal/testhelpers"
+)
+
+// decodeInto unmarshals a response body into out (best-effort: an empty body
+// leaves out zero-valued).
+func decodeInto(t *testing.T, resp *http.Response, out any) {
+	t.Helper()
+	raw, err := io.ReadAll(resp.Body)
+	require.NoError(t, err)
+	if len(raw) > 0 {
+		require.NoError(t, json.Unmarshal(raw, out))
+	}
+}
+
+// mustParseUUID parses s or fails the test.
+func mustParseUUID(t *testing.T, s string) uuid.UUID {
+	t.Helper()
+	id, err := uuid.Parse(s)
+	require.NoError(t, err)
+	return id
+}
+
+// doDelete issues DELETE /api/v1/resources/{ref} with the given JWT.
+func doDelete(t *testing.T, fx grpcProvFixture, ref, jwt string) (*http.Response, grpcProvResp) {
+	t.Helper()
+	req := httptest.NewRequest(http.MethodDelete, "/api/v1/resources/"+ref, nil)
+	req.Header.Set("X-Forwarded-For", "10.70.0.1")
+	req.Header.Set("Authorization", "Bearer "+jwt)
+	resp, err := fx.app.Test(req, 15000)
+	require.NoError(t, err)
+	var parsed grpcProvResp
+	decodeInto(t, resp, &parsed)
+	return resp, parsed
+}
+
+// resourceStatus reads the row's current status.
+func resourceStatus(t *testing.T, fx grpcProvFixture, id string) string {
+	t.Helper()
+	var status string
+	require.NoError(t, fx.db.QueryRowContext(context.Background(),
+		`SELECT status FROM resources WHERE id = $1::uuid`, id).Scan(&status))
+	return status
+}
+
+// TestResourceDelete_ByRowID_Succeeds is the headline fix: the id returned by
+// the list endpoint is accepted by DELETE, with the deprovision RPC still
+// addressed by the resource's TOKEN (backend object names derive from it).
+func TestResourceDelete_ByRowID_Succeeds(t *testing.T) {
+	fake := &fakeProvisioner{}
+	fx := setupGRPCProvFixture(t, fake, false)
+	teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro")
+	jwt := authSessionJWT(t, fx.db, teamID)
+	id, token := seedSourceResource(t, fx.db, teamID, "postgres", "pro", "production")
+
+	resp, _ := doDelete(t, fx, id, jwt)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode,
+		"DELETE by row id must succeed — the list endpoint keys rows by id")
+
+	assert.Equal(t, "deleted", resourceStatus(t, fx, id))
+
+	// The destructive deprovision must target the TOKEN, not the path id.
+	require.GreaterOrEqual(t, fake.deprovisionCount(), 1, "deprovision RPC must fire")
+	require.NotNil(t, fake.lastDeprovision())
+	assert.Equal(t, token, fake.lastDeprovision().GetToken(),
+		"deprovision must be addressed by resource.Token even when the path param is the row id")
+}
+
+// TestResourceDelete_ByToken_StillWorks pins the historical contract.
+func TestResourceDelete_ByToken_StillWorks(t *testing.T) {
+	fake := &fakeProvisioner{}
+	fx := setupGRPCProvFixture(t, fake, false)
+	teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro")
+	jwt := authSessionJWT(t, fx.db, teamID)
+	id, token := seedSourceResource(t, fx.db, teamID, "postgres", "pro", "production")
+
+	resp, _ := doDelete(t, fx, token, jwt)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode)
+	assert.Equal(t, "deleted", resourceStatus(t, fx, id))
+}
+
+// TestResourceDelete_ByRowID_OtherTeam_404 — authorization is identical for
+// the id form: another team's row id resolves but must 404 (never confirm
+// existence).
+func TestResourceDelete_ByRowID_OtherTeam_404(t *testing.T) {
+	fake := &fakeProvisioner{}
+	fx := setupGRPCProvFixture(t, fake, false)
+	ownerTeam := testhelpers.MustCreateTeamDB(t, fx.db, "pro")
+	id, _ := seedSourceResource(t, fx.db, ownerTeam, "postgres", "pro", "production")
+
+	attackerTeam := testhelpers.MustCreateTeamDB(t, fx.db, "pro")
+	jwt := authSessionJWT(t, fx.db, attackerTeam)
+
+	resp, body := doDelete(t, fx, id, jwt)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusNotFound, resp.StatusCode)
+	assert.Equal(t, "not_found", body.Error)
+	assert.Equal(t, "active", resourceStatus(t, fx, id), "row must be untouched")
+	assert.Equal(t, 0, fake.deprovisionCount(), "no deprovision for an unauthorized delete")
+}
+
+// TestResourceDelete_UnknownUUID_404 — a UUID matching neither token nor id.
+func TestResourceDelete_UnknownUUID_404(t *testing.T) {
+	fake := &fakeProvisioner{}
+	fx := setupGRPCProvFixture(t, fake, false)
+	teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro")
+	jwt := authSessionJWT(t, fx.db, teamID)
+
+	resp, body := doDelete(t, fx, "00000000-0000-4000-8000-00000000dead", jwt)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusNotFound, resp.StatusCode)
+	assert.Equal(t, "not_found", body.Error)
+}
+
+// TestProvisionRollback_PersistsFailedRow_ListedAndDeletable drives the full
+// Wave-2 A1 lifecycle: a failed provision leaves a status='failed' row that
+// (a) the caller can poll, (b) appears in GET /api/v1/resources, (c) does NOT
+// count against the per-type quota, and (d) is deletable by row id.
+func TestProvisionRollback_PersistsFailedRow_ListedAndDeletable(t *testing.T) {
+	fake := &fakeProvisioner{failProvision: true}
+	fx := setupGRPCProvFixture(t, fake, false)
+	teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro")
+	jwt := authSessionJWT(t, fx.db, teamID)
+
+	resp, body := doProvision(t, fx, "/db/new", "10.71.0.1", jwt, map[string]any{"name": "rollback-failed-row"})
+	resp.Body.Close()
+	require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode)
+	require.Equal(t, "provision_failed", body.Error)
+
+	// (a) The row persists in the terminal 'failed' state — it must NOT have
+	// vanished (the pre-A1 soft-delete made a timed-out provision unpollable).
+	var id, status string
+	require.NoError(t, fx.db.QueryRowContext(context.Background(),
+		`SELECT id::text, status FROM resources WHERE team_id = $1::uuid AND resource_type = 'postgres'
+		 ORDER BY created_at DESC LIMIT 1`, teamID).Scan(&id, &status))
+	require.Equal(t, models.StatusFailed, status,
+		"provision rollback must persist a 'failed' row, not delete it")
+
+	// (b) Visible in the team list (only 'deleted' rows are hidden).
+	listReq := httptest.NewRequest(http.MethodGet, "/api/v1/resources", nil)
+	listReq.Header.Set("Authorization", "Bearer "+jwt)
+	listResp, err := fx.app.Test(listReq, 15000)
+	require.NoError(t, err)
+	defer listResp.Body.Close()
+	require.Equal(t, http.StatusOK, listResp.StatusCode)
+	var list struct {
+		Items []struct {
+			ID     string `json:"id"`
+			Status string `json:"status"`
+		} `json:"items"`
+	}
+	decodeInto(t, listResp, &list)
+	found := false
+	for _, it := range list.Items {
+		if it.ID == id {
+			found = true
+			assert.Equal(t, models.StatusFailed, it.Status)
+		}
+	}
+	assert.True(t, found, "failed row must appear in GET /api/v1/resources")
+
+	// (c) Quota-exempt: failed rows never count toward the per-type cap.
+	n, err := models.CountActiveResourcesByTeamAndType(context.Background(), fx.db,
+		mustParseUUID(t, teamID), "postgres")
+	require.NoError(t, err)
+	assert.Equal(t, 0, n, "a failed row must not count against the plan quota")
+
+	// (d) Deletable by row id — terminal cleanup stays in the user's hands.
+	delResp, _ := doDelete(t, fx, id, jwt)
+	defer delResp.Body.Close()
+	require.Equal(t, http.StatusOK, delResp.StatusCode, "failed rows must be deletable")
+	assert.Equal(t, "deleted", resourceStatus(t, fx, id))
+}
+
+// TestProvisionRollback_Cache_Authenticated_FailedRow covers the
+// authenticated /cache/new rollback arm (cache.go) — the one provision-fail
+// arm the pre-A1 suites didn't reach — and asserts the new terminal state.
+func TestProvisionRollback_Cache_Authenticated_FailedRow(t *testing.T) {
+	fake := &fakeProvisioner{failProvision: true}
+	fx := setupGRPCProvFixture(t, fake, false)
+	teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro")
+	jwt := authSessionJWT(t, fx.db, teamID)
+
+	resp, body := doProvision(t, fx, "/cache/new", "10.72.0.1", jwt, map[string]any{"name": "cache-auth-fail"})
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode)
+	require.Equal(t, "provision_failed", body.Error)
+
+	var status string
+	require.NoError(t, fx.db.QueryRowContext(context.Background(),
+		`SELECT status FROM resources WHERE team_id = $1::uuid AND resource_type = 'redis'
+		 ORDER BY created_at DESC LIMIT 1`, teamID).Scan(&status))
+	assert.Equal(t, models.StatusFailed, status)
+}
+
+// TestResourceDelete_Storage_ByRowID_AuditUsesToken drives the storage
+// deprovision arm with a SUCCESSFUL revoke (hermetic minio-admin fake) under
+// the id-addressed form, proving the IAM-removal audit row is keyed by the
+// resource TOKEN (key_<token[:8]>), not the path id.
+func TestResourceDelete_Storage_ByRowID_AuditUsesToken(t *testing.T) {
+	db, clean := testhelpers.SetupTestDB(t)
+	defer clean()
+	rdb, rClean := testhelpers.SetupTestRedis(t)
+	defer rClean()
+	teamID := testhelpers.MustCreateTeamDB(t, db, "pro")
+	userID := uuid.NewString()
+
+	h := handlers.NewResourceHandler(db, rdb, resourceResidualConfig(), plans.Default(), nil, prefixScopedProvider(t))
+	app := resourceResidualApp(t, db, rdb, h, teamID, userID)
+
+	token := seedTeamResource(t, db, teamID, "storage", "active")
+	var rowID string
+	require.NoError(t, db.QueryRowContext(context.Background(),
+		`SELECT id::text FROM resources WHERE token = $1`, token).Scan(&rowID))
+
+	req := httptest.NewRequest(http.MethodDelete, "/api/v1/resources/"+rowID, nil)
+	resp, err := app.Test(req, 10000)
+	require.NoError(t, err)
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode, "storage delete by row id must succeed")
+
+	// The IAM audit emit is async (safego.Go) — poll briefly.
+	wantFragment := "key_" + token[:8]
+	deadline := time.Now().Add(3 * time.Second)
+	for {
+		var summary string
+		err := db.QueryRowContext(context.Background(),
+			`SELECT summary FROM audit_log WHERE team_id = $1::uuid AND kind = $2
+			 ORDER BY created_at DESC LIMIT 1`, teamID, models.AuditKindStorageIAMUserDeleted).Scan(&summary)
+		if err == nil {
+			assert.Contains(t, summary, wantFragment,
+				"IAM audit summary must reference the TOKEN-derived key, not the path id")
+			return
+		}
+		if time.Now().After(deadline) {
+			t.Fatalf("storage IAM audit row never appeared: %v", err)
+		}
+		time.Sleep(50 * time.Millisecond)
+	}
+}
diff --git a/internal/handlers/resource_env_mw_final3_test.go b/internal/handlers/resource_env_mw_final3_test.go
index 6adefefc..1c3b699f 100644
--- a/internal/handlers/resource_env_mw_final3_test.go
+++ b/internal/handlers/resource_env_mw_final3_test.go
@@ -1,7 +1,7 @@
 package handlers_test
 
 // resource_env_mw_final3_test.go — FINAL serial pass #3. Covers all three arms
-// of ResourceEnvByTokenForMiddleware (env_policy_helpers.go):
+// of ResourceEnvByTokenOrIDForMiddleware (env_policy_helpers.go):
 //   - non-UUID :id → ("", nil)               (line 32)
 //   - token not found → ("", nil) fail-open   (line 38)
 //   - real resource → (env, nil) happy        (line 40)
@@ -27,15 +27,15 @@ func TestResourceEnvByTokenMWFinal3_Arms(t *testing.T) {
 	teamID := testhelpers.MustCreateTeamDB(t, db, "pro")
 
 	// Seed a real resource so the happy arm returns its env.
-	var token string
+	var rowID, token string
 	require.NoError(t, db.QueryRowContext(context.Background(),
 		`INSERT INTO resources (team_id, resource_type, tier, env, status)
 		 VALUES ($1::uuid, 'postgres', 'pro', 'staging', 'active')
-		 RETURNING token::text`, teamID).Scan(&token))
+		 RETURNING id::text, token::text`, teamID).Scan(&rowID, &token))
 
 	app := fiber.New()
 	app.Get("/r/:id", func(c *fiber.Ctx) error {
-		env, err := handlers.ResourceEnvByTokenForMiddleware(c, db)
+		env, err := handlers.ResourceEnvByTokenOrIDForMiddleware(c, db)
 		if err != nil {
 			return c.Status(http.StatusInternalServerError).SendString("err")
 		}
@@ -59,7 +59,12 @@ func TestResourceEnvByTokenMWFinal3_Arms(t *testing.T) {
 	_, body = get(uuid.NewString())
 	assert.Empty(t, body)
 
-	// real resource → its env (line 40).
+	// real resource by token → its env (line 40).
 	_, body = get(token)
 	assert.Equal(t, "staging", body)
+
+	// real resource by ROW ID → its env (Wave-2 A1: the id-addressed DELETE
+	// form must hit the same env-policy gate as the token form).
+	_, body = get(rowID)
+	assert.Equal(t, "staging", body)
 }
diff --git a/internal/handlers/resourcestatus_conversion_test.go b/internal/handlers/resourcestatus_conversion_test.go
index 2289e656..11f41b81 100644
--- a/internal/handlers/resourcestatus_conversion_test.go
+++ b/internal/handlers/resourcestatus_conversion_test.go
@@ -29,7 +29,7 @@ import (
 // allRawStatuses is the set of status strings the resources table can
 // hold plus a junk value, so the equivalence check exercises the
 // unrecognised-value path too.
-var allRawStatuses = []string{"active", "paused", "suspended", "expired", "deleted", "garbage"}
+var allRawStatuses = []string{"active", "paused", "suspended", "failed", "expired", "deleted", "garbage"}
 
 // TestPauseStatusPredicate_EquivalentToOldLiterals covers resource.go
 // Pause: old code rejected with "already_paused" on Status == "paused"
diff --git a/internal/handlers/storage.go b/internal/handlers/storage.go
index 7b07578d..c8c3ba81 100644
--- a/internal/handlers/storage.go
+++ b/internal/handlers/storage.go
@@ -328,7 +328,7 @@ func (h *StorageHandler) NewStorage(c *fiber.Ctx) error {
 		middleware.RecordProvisionFail("storage", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("storage.new.provision_failed",
 			"error", err, "token", tokenStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("storage.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondError(c, fiber.StatusServiceUnavailable, "provision_failed", "Failed to provision storage credentials")
@@ -528,7 +528,7 @@ func (h *StorageHandler) newStorageAuthenticated(
 		middleware.RecordProvisionFail("storage", middleware.ProvisionFailBackendUnavailable)
 		slog.Error("storage.new.provision_failed_auth",
 			"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("storage.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondError(c, fiber.StatusServiceUnavailable, "provision_failed", "Failed to provision storage credentials")
diff --git a/internal/handlers/vector.go b/internal/handlers/vector.go
index 71365929..b67da705 100644
--- a/internal/handlers/vector.go
+++ b/internal/handlers/vector.go
@@ -360,7 +360,7 @@ func (h *VectorHandler) NewVector(c *fiber.Ctx) error {
 		metrics.ProvisionFailures.WithLabelValues(models.ResourceTypeVector, "grpc_error").Inc()
 		slog.Error("vector.new.provision_failed",
 			"error", err, "token", tokenStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("vector.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision vector database")
@@ -520,7 +520,7 @@ func (h *VectorHandler) newVectorAuthenticated(
 		metrics.ProvisionFailures.WithLabelValues(models.ResourceTypeVector, "grpc_error").Inc()
 		slog.Error("vector.new.provision_failed_auth",
 			"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
-		if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
+		if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
 			slog.Error("vector.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
 		}
 		return respondProvisionFailed(c, err, "Failed to provision vector database")
diff --git a/internal/models/coverage_resource_test.go b/internal/models/coverage_resource_test.go
index f53c96c5..c1b1f98c 100644
--- a/internal/models/coverage_resource_test.go
+++ b/internal/models/coverage_resource_test.go
@@ -73,6 +73,22 @@ func TestMarkResourceActive_Branches(t *testing.T) {
 	require.ErrorContains(t, MarkResourceActive(ctx, db3, uuid.New()), "boom")
 }
 
+func TestMarkResourceFailed_Branches(t *testing.T) {
+	ctx := context.Background()
+	db, mock := newMock(t)
+	mock.ExpectExec(`UPDATE resources SET status = 'failed'`).WillReturnResult(sqlmock.NewResult(0, 1))
+	require.NoError(t, MarkResourceFailed(ctx, db, uuid.New()))
+
+	// Zero rows — the row was already moved out of 'pending' by another path.
+	db2, mock2 := newMock(t)
+	mock2.ExpectExec(`UPDATE resources SET status = 'failed'`).WillReturnResult(sqlmock.NewResult(0, 0))
+	require.ErrorIs(t, MarkResourceFailed(ctx, db2, uuid.New()), ErrResourceNotPending)
+
+	db3, mock3 := newMock(t)
+	mock3.ExpectExec(`UPDATE resources SET status = 'failed'`).WillReturnError(errors.New("boom"))
+	require.ErrorContains(t, MarkResourceFailed(ctx, db3, uuid.New()), "boom")
+}
+
 func TestCountActiveResourcesByTeamAndType_Branches(t *testing.T) {
 	ctx := context.Background()
 	db, mock := newMock(t)
@@ -223,17 +239,6 @@ func TestSetWebhookHMACSecret_Branches(t *testing.T) {
 	require.ErrorContains(t, SetWebhookHMACSecret(ctx, db3, uuid.New(), "x"), "boom")
 }
 
-func TestSoftDeleteResource_Branches(t *testing.T) {
-	ctx := context.Background()
-	db, mock := newMock(t)
-	mock.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnResult(sqlmock.NewResult(0, 1))
-	require.NoError(t, SoftDeleteResource(ctx, db, uuid.New()))
-
-	db2, mock2 := newMock(t)
-	mock2.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnError(errors.New("boom"))
-	require.ErrorContains(t, SoftDeleteResource(ctx, db2, uuid.New()), "boom")
-}
-
 func TestSoftDeleteResourceIfActive_Branches(t *testing.T) {
 	ctx := context.Background()
 
diff --git a/internal/models/resource.go b/internal/models/resource.go
index ff03f99a..0dd8ed23 100644
--- a/internal/models/resource.go
+++ b/internal/models/resource.go
@@ -197,6 +197,17 @@ const StatusPending = "pending"
 // StatusActive is the canonical "provisioned and usable" status.
 const StatusActive = "active"
 
+// StatusFailed is the terminal status a resource row carries when the backend
+// provision RPC failed or timed out and the handler rolled the provision back.
+// Before Wave-2 A1 (2026-06-11) the rollback soft-deleted the row, so a
+// timed-out provision VANISHED from every read surface and the caller had no
+// terminal state to poll. A failed row is visible in lists (only 'deleted' is
+// hidden), deletable via DELETE /api/v1/resources/:id, exempt from quota
+// counts (those filter status='active'), and carries no live backing infra —
+// the backend object either never existed or was torn down by the rollback's
+// best-effort cleanup. Added by migration 070_resources_failed_status.sql.
+const StatusFailed = "failed"
+
 // CreateResource inserts a new resource row and returns it.
 //
 // MR-P0-2 (BugBash 2026-05-20): the row is inserted with status='pending', NOT
@@ -271,6 +282,31 @@ func MarkResourceActive(ctx context.Context, db *sql.DB, id uuid.UUID) error {
 // that is not in the expected mid-provision state.
 var ErrResourceNotPending = fmt.Errorf("models: resource is not pending")
 
+// MarkResourceFailed flips a resource from 'pending' → 'failed'. It is the
+// provision-rollback counterpart of MarkResourceActive: the handler runs it
+// when the backend provision RPC fails/times out, or when a post-RPC
+// persistence step fails (finalizeProvision), so the caller is left with a
+// pollable terminal row instead of a vanished one.
+//
+// The atomic `WHERE status = 'pending'` guard mirrors MarkResourceActive: a
+// row some other path already moved out of 'pending' (a reconciler abandon, a
+// concurrent delete) is NOT clobbered. Returns ErrResourceNotPending on a
+// zero-row update so the caller can log the unexpected state; rollback callers
+// treat that as best-effort (the 503 to the customer is unchanged either way).
+func MarkResourceFailed(ctx context.Context, db *sql.DB, id uuid.UUID) error {
+	res, err := db.ExecContext(ctx, `
+		UPDATE resources SET status = 'failed' WHERE id = $1 AND status = 'pending'
+	`, id)
+	if err != nil {
+		return fmt.Errorf("models.MarkResourceFailed: %w", err)
+	}
+	n, _ := res.RowsAffected()
+	if n == 0 {
+		return ErrResourceNotPending
+	}
+	return nil
+}
+
 // CountActiveResourcesByTeamAndType returns the number of active (non-deleted)
 // resources of the given type owned by a team. Used for plan limit enforcement.
 // Counts across ALL environments — plan limits apply per team, not per env.
@@ -454,19 +490,6 @@ func SetWebhookHMACSecret(ctx context.Context, db *sql.DB, resourceID uuid.UUID,
 	return nil
 }
 
-// SoftDeleteResource marks a resource status as 'deleted'. Used by the
-// provision-rollback paths (db/cache/nosql/queue/storage/vector), which always
-// operate on a row they just created — they don't need the concurrency guard.
-func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error {
-	_, err := db.ExecContext(ctx, `
-		UPDATE resources SET status = 'deleted' WHERE id = $1
-	`, id)
-	if err != nil {
-		return fmt.Errorf("models.SoftDeleteResource: %w", err)
-	}
-	return nil
-}
-
 // SoftDeleteResourceIfActive flips status to 'deleted' ONLY when the row is not
 // already deleted, returning whether THIS call performed the transition.
 //
diff --git a/internal/provisioner/client.go b/internal/provisioner/client.go
index f0cb5be5..ed3f6fb7 100644
--- a/internal/provisioner/client.go
+++ b/internal/provisioner/client.go
@@ -33,27 +33,40 @@ const (
 	provisionerCircuitCooldown  = 30 * time.Second
 )
 
+// dedicatedProvisionReadyWait mirrors the provisioner's k8s-backend pod-ready
+// timeouts (redisK8sReadyTO / k8sReadyTimeout / natsK8sReadyTO /
+// mongoK8sReadyTO in provisioner/internal/backend/* — all 3 minutes): the
+// documented budget a dedicated-pod provision may legitimately spend waiting
+// for PVC bind + image pull + readiness. Every provision RPC deadline on this
+// client MUST be ≥ this value plus provisionRPCDeadlineMargin — an RPC
+// deadline that undercuts the backend's own ready-wait kills a provision that
+// was still on track to succeed (and, post Wave-2 A1, strands a 'failed' row
+// for a resource the provisioner then finishes bringing up).
+// TestProvisionTimeouts_NeverUndercutDedicatedReadyWait enforces the
+// invariant for every tier and every provision deadline in this file.
+const dedicatedProvisionReadyWait = 3 * time.Minute
+
+// provisionRPCDeadlineMargin is the headroom an RPC deadline carries on top
+// of dedicatedProvisionReadyWait, covering the non-pod-wait parts of the
+// provision (credential mint, ACL setup, network round-trips).
+const provisionRPCDeadlineMargin = 30 * time.Second
+
 // cacheProvisionTimeout bounds the api → provisioner ProvisionCache RPC.
 //
-// Unlike Postgres/Mongo, a Redis namespace provision does NOT spin up a
-// per-tenant pod with a PVC bind + image pull + DB init — it carves a
-// namespace/ACL out of an already-running Redis (observed ~1–6s end to
-// end, anonymous AND authenticated). It therefore does not need the
-// 4–5m cold-pod budget that provisionTimeout() grants the pod-backed
-// resource types.
-//
-// Granting Redis that same multi-minute budget means a *hung* provisioner
-// hangs the whole /cache/new request for up to 5 minutes (the symptom that
-// motivated this fix: authed /cache/new observed hanging >60s while anon
-// returned in ~6s). A generous-but-bounded 45s ceiling is ~7× the slowest
-// observed healthy provision, so a legitimately slow-but-OK Redis carve
-// still succeeds, while a genuine hang fails fast — the existing handler
-// error path then soft-deletes the pending resource and returns a clean
-// 503 provision_failed instead of an indefinite hang.
+// History: this was a 45s "fail fast" ceiling from when a Redis provision was
+// believed to always be a shared-instance namespace/ACL carve (~1–6s). In
+// production REDIS_PROVISION_BACKEND=k8s provisions a dedicated Redis pod
+// (and under tier-aware routing the Team tier always does), which the
+// provisioner legitimately waits up to dedicatedProvisionReadyWait (3m) for —
+// so the 45s RPC deadline undercut the backend's own ready-wait and killed
+// cold-pod provisions mid-flight. Reconciled (Wave-2 A1): the deadline is now
+// derived from the dedicated ready-wait plus margin, never below it. Hung-
+// provisioner protection comes from this still-bounded deadline plus the
+// shared circuit breaker (5 consecutive failures → ErrOpen in <1ms).
 //
 // A package var (not const) so the timeout→503 unit test can shrink it to
-// a few ms instead of blocking the suite for 45s; production never mutates it.
-var cacheProvisionTimeout = 45 * time.Second
+// a few ms instead of blocking the suite; production never mutates it.
+var cacheProvisionTimeout = dedicatedProvisionReadyWait + provisionRPCDeadlineMargin
 
 // Credentials matches the shape returned by local providers.
 type Credentials struct {
@@ -282,7 +295,9 @@ func (c *Client) ctxWithAuth(ctx context.Context) context.Context {
 // every-tier change). PVC bind + image pull + postgres init can take 30-90s on
 // a cold node, so 10s (the old anonymous default) drops the connection while
 // the pod is still coming up. Anonymous gets a tight 4m budget; pro/team get
-// 5m for larger images and bigger PVCs.
+// 5m for larger images and bigger PVCs. Both budgets honour the floor of
+// dedicatedProvisionReadyWait + provisionRPCDeadlineMargin (3m30s) — see the
+// invariant on dedicatedProvisionReadyWait.
 func provisionTimeout(tier string) time.Duration {
 	if tier == "pro" || tier == "team" || tier == "growth" {
 		return 5 * time.Minute
diff --git a/internal/provisioner/client_cov_test.go b/internal/provisioner/client_cov_test.go
index 76e64b81..47a76468 100644
--- a/internal/provisioner/client_cov_test.go
+++ b/internal/provisioner/client_cov_test.go
@@ -425,27 +425,47 @@ func TestProvisionCache_SuccessAndError(t *testing.T) {
 	}
 }
 
-// TestCacheProvisionTimeout_Value pins the production deadline so a future
-// edit that re-grants Redis the multi-minute cold-pod budget (re-introducing
-// the /cache/new hang) reds the suite.
+// TestCacheProvisionTimeout_Value pins the production deadline derivation:
+// the cache RPC deadline must equal the provisioner's dedicated pod-ready
+// budget plus margin. A bare literal here (the old 45s "Redis is always a
+// shared carve" ceiling) undercut the provisioner's 3m k8s ready-wait and
+// killed legitimate cold-pod provisions mid-flight (Wave-2 A1).
 func TestCacheProvisionTimeout_Value(t *testing.T) {
-	if cacheProvisionTimeout != 45*time.Second {
-		t.Errorf("cacheProvisionTimeout = %s; want 45s (generous-but-bounded Redis carve ceiling)", cacheProvisionTimeout)
-	}
-	// Must be well under provisionTimeout's pod-backed budget — that gap is
-	// the whole point: a hung provisioner fails /cache/new fast instead of
-	// hanging it for minutes.
-	if cacheProvisionTimeout >= provisionTimeout("pro") {
-		t.Errorf("cacheProvisionTimeout (%s) must be tighter than provisionTimeout(pro) (%s)",
+	if want := dedicatedProvisionReadyWait + provisionRPCDeadlineMargin; cacheProvisionTimeout != want {
+		t.Errorf("cacheProvisionTimeout = %s; want %s (dedicatedProvisionReadyWait + provisionRPCDeadlineMargin)",
+			cacheProvisionTimeout, want)
+	}
+	// Still bounded: must not balloon past provisionTimeout's pod-backed
+	// budget — a hung provisioner has to fail /cache/new within the same
+	// envelope as every other provision RPC.
+	if cacheProvisionTimeout > provisionTimeout("pro") {
+		t.Errorf("cacheProvisionTimeout (%s) must not exceed provisionTimeout(pro) (%s)",
 			cacheProvisionTimeout, provisionTimeout("pro"))
 	}
 }
 
+// TestProvisionTimeouts_NeverUndercutDedicatedReadyWait enforces the Wave-2 A1
+// invariant for EVERY provision deadline in this package: no RPC deadline may
+// undercut the provisioner's dedicated pod-ready wait (3m) plus margin. The
+// tier list spans both provisionTimeout branches plus unknown-tier fallback so
+// a future tier addition can't silently reintroduce an undercutting budget.
+func TestProvisionTimeouts_NeverUndercutDedicatedReadyWait(t *testing.T) {
+	floor := dedicatedProvisionReadyWait + provisionRPCDeadlineMargin
+	for _, tier := range []string{"anonymous", "free", "hobby", "hobby_plus", "pro", "growth", "team", "some-future-tier"} {
+		if got := provisionTimeout(tier); got < floor {
+			t.Errorf("provisionTimeout(%q) = %s; must be >= %s (dedicated ready-wait + margin)", tier, got, floor)
+		}
+	}
+	if cacheProvisionTimeout < floor {
+		t.Errorf("cacheProvisionTimeout = %s; must be >= %s (dedicated ready-wait + margin)", cacheProvisionTimeout, floor)
+	}
+}
+
 // TestProvisionCache_HangBecomesDeadlineError proves the defensive fix: when
 // the provisioner hangs, ProvisionCache's own client-side deadline fires and
 // returns a DeadlineExceeded-class error in ~the bounded window — NOT an
 // indefinite block. This is the error the cache handler converts into a 503
-// provision_failed after soft-deleting the pending resource.
+// provision_failed after marking the pending resource 'failed'.
 //
 // We shrink cacheProvisionTimeout to a few ms (restored via t.Cleanup) so the
 // test is fast, and give the *caller* context a generous timeout so the error
diff --git a/internal/router/router.go b/internal/router/router.go
index 6a82389f..9aa646d1 100644
--- a/internal/router/router.go
+++ b/internal/router/router.go
@@ -1050,7 +1050,7 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid
 	api.Delete("/resources/:id",
 		middleware.RequireEnvAccess(middleware.EnvPolicyActionDeleteResource,
 			middleware.WithEnvLookup(func(c *fiber.Ctx) (string, error) {
-				return handlers.ResourceEnvByTokenForMiddleware(c, db)
+				return handlers.ResourceEnvByTokenOrIDForMiddleware(c, db)
 			}),
 		),
 		resourceH.Delete,
diff --git a/openapi.snapshot.json b/openapi.snapshot.json
index 988b50ee..e7866fa1 100644
--- a/openapi.snapshot.json
+++ b/openapi.snapshot.json
@@ -4581,6 +4581,7 @@
       "delete": {
         "parameters": [
           {
+            "description": "Accepts EITHER the resource's 'id' (as returned by GET /api/v1/resources) OR its provision 'token'. Resolution tries token first, then id; authorization (team ownership) is identical for both forms.",
             "in": "path",
             "name": "id",
             "required": true,
@@ -4596,6 +4597,9 @@
           },
           "403": {
             "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }."
+          },
+          "404": {
+            "description": "Not found — no resource with that id or token, or it belongs to another team."
           }
         },
         "security": [

From 2be5e70bd3d837efede4e6fc4a3fba41cba51f60 Mon Sep 17 00:00:00 2001
From: Manas Srivastava <mastermanas805@gmail.com>
Date: Thu, 11 Jun 2026 01:19:10 +0530
Subject: [PATCH 2/2] test(router): cover the DELETE /resources/:id env-lookup
 wiring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Wave-2 A1 rename of the WithEnvLookup closure target
(ResourceEnvByTokenOrIDForMiddleware) made router.go's closure line a
changed line under diff-cover, and it had no unit coverage (the route was
only exercised by build-tagged E2E). New router-package test drives an
authenticated DELETE by ROW ID through the real router — covering the
closure, the id-fallback resolution, and the idempotent-retry path.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../router/resource_delete_env_lookup_test.go | 77 +++++++++++++++++++
 1 file changed, 77 insertions(+)
 create mode 100644 internal/router/resource_delete_env_lookup_test.go

diff --git a/internal/router/resource_delete_env_lookup_test.go b/internal/router/resource_delete_env_lookup_test.go
new file mode 100644
index 00000000..028ffe41
--- /dev/null
+++ b/internal/router/resource_delete_env_lookup_test.go
@@ -0,0 +1,77 @@
+package router_test
+
+// resource_delete_env_lookup_test.go — drives the DELETE /api/v1/resources/:id
+// route through the REAL router so the WithEnvLookup closure wired in
+// router.go (the `return handlers.ResourceEnvByTokenOrIDForMiddleware(c, db)`
+// line) executes under unit coverage, not just E2E. Wave-2 A1 renamed that
+// helper (token-or-id resolution); this test pins the wiring:
+//
+//   - an authenticated DELETE addressed by the resource's ROW ID resolves
+//     through the env-lookup (id fallback) and soft-deletes the row, and
+//   - a second DELETE on the same (now-deleted) row reports idempotent
+//     success — proving the closure's fail-open path also routes through.
+
+import (
+	"context"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+
+	"instant.dev/internal/email"
+	"instant.dev/internal/plans"
+	"instant.dev/internal/router"
+	"instant.dev/internal/testhelpers"
+)
+
+func TestRouter_ResourceDelete_ByRowID_EnvLookupWired(t *testing.T) {
+	db, dbClean := testhelpers.SetupTestDB(t)
+	defer dbClean()
+	rdb, rdbClean := testhelpers.SetupTestRedis(t)
+	defer rdbClean()
+
+	cfg := newRouterTestConfig()
+	app := router.New(cfg, db, rdb, nil, email.NewNoop(), plans.Default(), nil, nil)
+
+	// Seed a team + user + an active resource owned by the team.
+	teamID := testhelpers.MustCreateTeamDB(t, db, "pro")
+	var userID string
+	require.NoError(t, db.QueryRowContext(context.Background(),
+		`INSERT INTO users (team_id, email) VALUES ($1::uuid, $2) RETURNING id::text`,
+		teamID, testhelpers.UniqueEmail(t)).Scan(&userID))
+	jwt := testhelpers.MustSignSessionJWT(t, userID, teamID, "router-del@example.com")
+
+	var rowID string
+	require.NoError(t, db.QueryRowContext(context.Background(),
+		`INSERT INTO resources (team_id, resource_type, tier, env, status)
+		 VALUES ($1::uuid, 'webhook', 'pro', 'staging', 'active')
+		 RETURNING id::text`, teamID).Scan(&rowID))
+
+	doDelete := func() *http.Response {
+		req := httptest.NewRequest(http.MethodDelete, "/api/v1/resources/"+rowID, nil)
+		req.Header.Set("Authorization", "Bearer "+jwt)
+		resp, err := app.Test(req, 15000)
+		require.NoError(t, err)
+		return resp
+	}
+
+	// DELETE by ROW ID through the real router: env-policy middleware runs
+	// the token-or-id env lookup, then the handler resolves + deletes.
+	resp := doDelete()
+	defer resp.Body.Close()
+	require.Equal(t, http.StatusOK, resp.StatusCode,
+		"DELETE by row id through the real router must succeed")
+
+	var status string
+	require.NoError(t, db.QueryRowContext(context.Background(),
+		`SELECT status FROM resources WHERE id = $1::uuid`, rowID).Scan(&status))
+	require.Equal(t, "deleted", status)
+
+	// Idempotent retry: the row is now 'deleted'; the env lookup still runs
+	// (resolves the deleted row) and the handler reports already_deleted.
+	resp2 := doDelete()
+	defer resp2.Body.Close()
+	require.Equal(t, http.StatusOK, resp2.StatusCode,
+		"repeat DELETE must be idempotent success")
+}