From 9d1cdff2a3ee2f959c149a7c87321e681ede03c9 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Thu, 11 Jun 2026 01:05:23 +0530 Subject: [PATCH 1/2] fix(resources): delete-by-id, failed provision rows, RPC-deadline floor (Wave-2 A1) Three related durability fixes from the cross-interface sweep: 1. DELETE /api/v1/resources/:id accepts BOTH the row id (as returned by GET /api/v1/resources) and the provision token. Resolution tries token first (historical contract preserved byte-for-byte), then falls back to id (resolveResourceByTokenOrID). The natural list -> delete-by-id flow 404'd 100% of the time before. Authorization is identical for both forms (team-scoped, 404-not-403). The deprovision RPC / cache key / IAM audit are now always addressed by resource.Token, never the raw path param. The env-policy middleware lookup (ResourceEnvByTokenOrIDForMiddleware) got the same fallback so the id-addressed form can't skip env_policy enforcement. openapi.go documents the dual-form param. 2. Provision rollback persists a status='failed' row instead of soft-deleting it, so a failed/timed-out provision is a pollable terminal state. models.MarkResourceFailed (atomic pending->failed, mirror of MarkResourceActive) replaces models.SoftDeleteResource at all 16 rollback sites (db/cache/nosql/queue/vector/storage x anon/auth/twin + finalizeProvision). Failed rows are visible in lists (only 'deleted' is hidden), deletable, quota-exempt (CountActiveResourcesByTeamAndType filters status='active'), and invisible to the reconciler ('pending') and TTL reaper (ReapableStatuses) - no live infra backs them. Migration 070 widens resources_status_check with 'failed'; 024/049/057 re-add the same canonical set (forward-consistency: the runner re-applies every migration on boot - 2026-06-10 incident class). SoftDeleteResource itself is removed (dead after this change). 3. Provision RPC deadlines can no longer undercut the provisioner's dedicated pod-ready wait. cacheProvisionTimeout was a 45s literal while the provisioner's k8s backends wait up to 3m for pod readiness (redisK8sReadyTO et al.) - a cold-pod Redis provision was killed mid-flight. New named constants dedicatedProvisionReadyWait (3m) + provisionRPCDeadlineMargin (30s) derive the deadline (3m30s); TestProvisionTimeouts_NeverUndercutDedicatedReadyWait enforces the floor for every tier and every provision deadline in the package. Co-Authored-By: Claude Fable 5 --- docs/openapi.yaml | 12 +- .../024_resources_paused_status.sql | 11 +- .../049_resources_suspended_status.sql | 7 +- .../057_resources_pending_status.sql | 5 +- .../070_resources_failed_status.sql | 51 ++++ internal/db/postgres_migrations_test.go | 2 +- internal/handlers/cache.go | 9 +- .../coverage_provisioner_grpc_test.go | 14 +- internal/handlers/db.go | 6 +- internal/handlers/env_policy_helpers.go | 21 +- internal/handlers/env_policy_test.go | 2 +- internal/handlers/finalize_provision_test.go | 14 +- internal/handlers/nosql.go | 9 +- internal/handlers/openapi.go | 5 +- internal/handlers/provision_helper.go | 14 +- .../provision_softdelete_final2_test.go | 29 +- internal/handlers/queue.go | 7 +- internal/handlers/resource.go | 36 ++- .../handlers/resource_delete_by_id_test.go | 271 ++++++++++++++++++ .../handlers/resource_env_mw_final3_test.go | 15 +- .../resourcestatus_conversion_test.go | 2 +- internal/handlers/storage.go | 4 +- internal/handlers/vector.go | 4 +- internal/models/coverage_resource_test.go | 27 +- internal/models/resource.go | 49 +++- internal/provisioner/client.go | 51 ++-- internal/provisioner/client_cov_test.go | 44 ++- internal/router/router.go | 2 +- openapi.snapshot.json | 4 + 29 files changed, 603 insertions(+), 124 deletions(-) create mode 100644 internal/db/migrations/070_resources_failed_status.sql create mode 100644 internal/handlers/resource_delete_by_id_test.go diff --git a/docs/openapi.yaml b/docs/openapi.yaml index c215e126..70346445 100644 --- a/docs/openapi.yaml +++ b/docs/openapi.yaml @@ -538,7 +538,17 @@ paths: security: - BearerAuth: [] parameters: - - $ref: "#/components/parameters/id" + - name: id + in: path + required: true + description: >- + Accepts EITHER the resource's `id` (as returned by + GET /api/v1/resources) OR its provision `token`. Resolution tries + token first, then id; authorization (team ownership) is identical + for both forms. + schema: + type: string + format: uuid responses: "200": description: Resource deleted. diff --git a/internal/db/migrations/024_resources_paused_status.sql b/internal/db/migrations/024_resources_paused_status.sql index fa1bfe97..921b3d1e 100644 --- a/internal/db/migrations/024_resources_paused_status.sql +++ b/internal/db/migrations/024_resources_paused_status.sql @@ -29,11 +29,12 @@ ALTER TABLE resources ADD CONSTRAINT resources_status_check -- Forward-consistent full status set (incident 2026-06-10). The migration -- runner RE-APPLIES every migration on each boot; a NARROW constraint here - -- (missing 'suspended' [added in 049] / 'pending' [added in 057]) crashes - -- the boot the moment a row already holds one of those later-added — but - -- valid — statuses. Re-adding the canonical set makes 024 safe to re-run - -- regardless of data. (024/049/057 now all define the same set.) - CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped')); + -- (missing 'suspended' [added in 049] / 'pending' [added in 057] / + -- 'failed' [added in 070]) crashes the boot the moment a row already holds + -- one of those later-added — but valid — statuses. Re-adding the canonical + -- set makes 024 safe to re-run regardless of data. (024/049/057/070 now + -- all define the same set.) + CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped')); ALTER TABLE resources ADD COLUMN IF NOT EXISTS paused_at TIMESTAMPTZ; diff --git a/internal/db/migrations/049_resources_suspended_status.sql b/internal/db/migrations/049_resources_suspended_status.sql index 36f77755..d23e0f3a 100644 --- a/internal/db/migrations/049_resources_suspended_status.sql +++ b/internal/db/migrations/049_resources_suspended_status.sql @@ -31,9 +31,10 @@ ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check; ALTER TABLE resources ADD CONSTRAINT resources_status_check -- Forward-consistent full status set (incident 2026-06-10): include 'pending' - -- (added in 057) so re-applying 049 on boot can't crash on a valid pending - -- row before 057 runs. 024/049/057 now all define the same canonical set. - CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped')); + -- (added in 057) and 'failed' (added in 070) so re-applying 049 on boot + -- can't crash on a valid later-added row before its own migration runs. + -- 024/049/057/070 now all define the same canonical set. + CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped')); -- Partial index for the auto-unsuspend scan. -- EnforceStorageQuotaWorker scans WHERE status = 'suspended' on every run to diff --git a/internal/db/migrations/057_resources_pending_status.sql b/internal/db/migrations/057_resources_pending_status.sql index 51d30016..d8a55ac9 100644 --- a/internal/db/migrations/057_resources_pending_status.sql +++ b/internal/db/migrations/057_resources_pending_status.sql @@ -42,7 +42,10 @@ ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check; ALTER TABLE resources ADD CONSTRAINT resources_status_check - CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped')); + -- Forward-consistent full status set (incident 2026-06-10): include 'failed' + -- (added in 070) so re-applying 057 on boot can't crash on a valid failed + -- row before 070 runs. 024/049/057/070 now all define the same canonical set. + CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped')); -- idx_resources_pending_sweep (the partial index the reconciler scans) was -- already created by migration 030_resource_heartbeat.sql — it indexes diff --git a/internal/db/migrations/070_resources_failed_status.sql b/internal/db/migrations/070_resources_failed_status.sql new file mode 100644 index 00000000..2d398b4e --- /dev/null +++ b/internal/db/migrations/070_resources_failed_status.sql @@ -0,0 +1,51 @@ +-- Migration: 070_resources_failed_status +-- +-- Add 'failed' as a permitted value in the resources.status CHECK constraint. +-- +-- Background (Wave-2 A1, cross-interface durability sweep 2026-06-11): +-- When the synchronous backend provision RPC fails or times out, every +-- provision handler used to ROLL BACK the just-created 'pending' row by +-- soft-deleting it (status='deleted'). From the caller's point of view the +-- resource simply VANISHED: a timed-out /db/new (or /cache/new, /nosql/new, +-- /queue/new, /vector/new, /storage/new, twin provision) returned 503 and +-- any follow-up GET on the token 404'd because the public read surface +-- hides deleted rows. Agents polling for an in-flight provision had no +-- terminal state to observe. +-- +-- The handlers now mark the row status='failed' instead +-- (models.MarkResourceFailed, pending→failed). A failed row is: +-- * visible in GET /api/v1/resources (lists filter only status='deleted') +-- * deletable via DELETE /api/v1/resources/:id (the status!='deleted' +-- guard in SoftDeleteResourceIfActive admits it) +-- * NOT counted toward plan quotas (quota counts filter status='active') +-- * never swept by the provisioner_reconciler (it keys on 'pending') or +-- the TTL reaper (ReapableStatuses() = active/paused/suspended) — the +-- backend object either never existed or was already torn down by the +-- rollback's best-effort cleanup, so there is nothing to deprovision. +-- +-- Without this migration every MarkResourceFailed UPDATE would hit +-- constraint-violation 23514 and the rollback path itself would error. +-- +-- Status semantics (updated): +-- pending — row inserted, backend provision RPC + URL persistence not yet +-- complete; the transient mid-provision state. NOT usable. +-- active — provisioned, accepting connections (or status-only for queue/storage/webhook) +-- paused — user-initiated pause (Pro+ only); infra revoked; data preserved +-- suspended — system-initiated suspend on storage quota breach; infra revoked +-- failed — terminal: the backend provision RPC failed/timed out and the +-- rollback kept the row as a pollable terminal state. No live +-- backing infra. Visible in lists; deletable; quota-exempt. +-- expired — TTL reached (anonymous resources); soft-deleted equivalent for anon +-- deleted — user-deleted (permanent credentials removed) +-- reaped — legacy: worker-reaped before 'deleted' was the canonical term +-- +-- Idempotent: DROP IF EXISTS + re-ADD with the same syntax, so re-running on a +-- schema that already applied this migration is harmless. Migrations 024/049/057 +-- were updated in the same change to define this same canonical set (the +-- migration runner RE-APPLIES every migration on each boot — see the +-- forward-consistency note in 024, incident 2026-06-10). + +ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check; +ALTER TABLE resources + ADD CONSTRAINT resources_status_check + CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped')); diff --git a/internal/db/postgres_migrations_test.go b/internal/db/postgres_migrations_test.go index cfb7e9f8..a2a1bb95 100644 --- a/internal/db/postgres_migrations_test.go +++ b/internal/db/postgres_migrations_test.go @@ -511,7 +511,7 @@ func TestConnectRedis_PanicsOnUnreachable(t *testing.T) { // re-applying it mid-sequence can never reject a valid row. No DB needed — // reads the embedded SQL via the same seam the runner uses. func TestMigrations_ResourcesStatusCheck_ForwardConsistent(t *testing.T) { - canonical := []string{"pending", "active", "paused", "suspended", "expired", "deleted", "reaped"} + canonical := []string{"pending", "active", "paused", "suspended", "failed", "expired", "deleted", "reaped"} checked := 0 for _, name := range MigrationFiles() { b, err := readMigrationFile(name) diff --git a/internal/handlers/cache.go b/internal/handlers/cache.go index 1248d184..3a403763 100644 --- a/internal/handlers/cache.go +++ b/internal/handlers/cache.go @@ -221,8 +221,9 @@ func (h *CacheHandler) NewCache(c *fiber.Ctx) error { middleware.RecordProvisionFail("redis", middleware.ProvisionFailBackendUnavailable) slog.Error("cache.new.provision_failed", "error", err, "token", tokenStr, "request_id", requestID) - // Soft-delete the resource record so limits aren't falsely consumed. - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + // Mark the pending row 'failed' — a pollable terminal state. Failed + // rows never count against quota (counts filter status='active'). + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("cache.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision Redis namespace") @@ -387,7 +388,7 @@ func (h *CacheHandler) newCacheAuthenticated( middleware.RecordProvisionFail("redis", middleware.ProvisionFailBackendUnavailable) slog.Error("cache.new.provision_failed_auth", "error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("cache.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision Redis namespace") @@ -563,7 +564,7 @@ func (h *CacheHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionFor middleware.RecordProvisionFail(models.ResourceTypeRedis, middleware.ProvisionFailBackendUnavailable) slog.Error("twin.cache.provision_failed", "error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("twin.cache.soft_delete_failed", "error", delErr, "resource_id", resource.ID, "request_id", in.RequestID) } diff --git a/internal/handlers/coverage_provisioner_grpc_test.go b/internal/handlers/coverage_provisioner_grpc_test.go index 43062eb7..c157e260 100644 --- a/internal/handlers/coverage_provisioner_grpc_test.go +++ b/internal/handlers/coverage_provisioner_grpc_test.go @@ -67,7 +67,8 @@ type fakeProvisioner struct { failProvision bool deprovisionCalls int - lastReq *provisionerv1.ProvisionRequest + lastReq *provisionerv1.ProvisionRequest + lastDeprovisionReq *provisionerv1.DeprovisionRequest } func (f *fakeProvisioner) ProvisionResource(_ context.Context, req *provisionerv1.ProvisionRequest) (*provisionerv1.ProvisionResponse, error) { @@ -113,13 +114,21 @@ func (f *fakeProvisioner) ProvisionResource(_ context.Context, req *provisionerv } } -func (f *fakeProvisioner) DeprovisionResource(_ context.Context, _ *provisionerv1.DeprovisionRequest) (*provisionerv1.DeprovisionResponse, error) { +func (f *fakeProvisioner) DeprovisionResource(_ context.Context, req *provisionerv1.DeprovisionRequest) (*provisionerv1.DeprovisionResponse, error) { f.mu.Lock() f.deprovisionCalls++ + f.lastDeprovisionReq = req f.mu.Unlock() return &provisionerv1.DeprovisionResponse{}, nil } +// lastDeprovision returns the most recent DeprovisionRequest (nil if none). +func (f *fakeProvisioner) lastDeprovision() *provisionerv1.DeprovisionRequest { + f.mu.Lock() + defer f.mu.Unlock() + return f.lastDeprovisionReq +} + func (f *fakeProvisioner) deprovisionCount() int { f.mu.Lock() defer f.mu.Unlock() @@ -231,6 +240,7 @@ func setupGRPCProvFixture(t *testing.T, fake *fakeProvisioner, badAESKey bool) g middleware.SetRoleLookupDB(db) api := app.Group("/api/v1", middleware.RequireAuth(cfg), middleware.PopulateTeamRole()) + api.Get("/resources", resourceH.List) api.Get("/resources/:id", resourceH.Get) api.Delete("/resources/:id", resourceH.Delete) api.Post("/resources/:id/provision-twin", twinH.ProvisionTwin) diff --git a/internal/handlers/db.go b/internal/handlers/db.go index 583ef4cf..728ae77c 100644 --- a/internal/handlers/db.go +++ b/internal/handlers/db.go @@ -260,7 +260,7 @@ func (h *DBHandler) NewDB(c *fiber.Ctx) error { middleware.RecordProvisionFail("postgres", middleware.ProvisionFailBackendUnavailable) slog.Error("db.new.provision_failed", "error", err, "token", tokenStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("db.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision Postgres database") @@ -438,7 +438,7 @@ func (h *DBHandler) newDBAuthenticated( middleware.RecordProvisionFail("postgres", middleware.ProvisionFailBackendUnavailable) slog.Error("db.new.provision_failed_auth", "error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("db.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision Postgres database") @@ -647,7 +647,7 @@ func (h *DBHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionForTwi middleware.RecordProvisionFail(models.ResourceTypePostgres, middleware.ProvisionFailBackendUnavailable) slog.Error("twin.db.provision_failed", "error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("twin.db.soft_delete_failed", "error", delErr, "resource_id", resource.ID, "request_id", in.RequestID) } diff --git a/internal/handlers/env_policy_helpers.go b/internal/handlers/env_policy_helpers.go index 7d48549f..55329576 100644 --- a/internal/handlers/env_policy_helpers.go +++ b/internal/handlers/env_policy_helpers.go @@ -14,24 +14,27 @@ import ( "github.com/gofiber/fiber/v2" "github.com/google/uuid" - "instant.dev/internal/models" ) -// ResourceEnvByTokenForMiddleware reads the env stored on a resource row -// addressed by the URL :id param (a public token UUID). Returns the env on -// success or "" on any error — the env-policy middleware fails OPEN on -// lookup error so a malformed/non-existent :id falls through to the -// handler's own 400/404 instead of a confusing 403/env_policy_denied. +// ResourceEnvByTokenOrIDForMiddleware reads the env stored on a resource row +// addressed by the URL :id param — resolved first as a public token UUID, +// then as the row's primary-key id (resolveResourceByTokenOrID), matching +// the DELETE handler's own resolution so the env-policy gate covers BOTH +// address forms (without the id fallback an id-addressed DELETE would skip +// env-policy enforcement entirely). Returns the env on success or "" on any +// error — the env-policy middleware fails OPEN on lookup error so a +// malformed/non-existent :id falls through to the handler's own 400/404 +// instead of a confusing 403/env_policy_denied. // // Exported with the verbose suffix so its single intended caller (the // router wiring) is unambiguous; this is not a general-purpose helper. -func ResourceEnvByTokenForMiddleware(c *fiber.Ctx, db *sql.DB) (string, error) { +func ResourceEnvByTokenOrIDForMiddleware(c *fiber.Ctx, db *sql.DB) (string, error) { tokenStr := c.Params("id") - token, err := uuid.Parse(tokenStr) + pathUUID, err := uuid.Parse(tokenStr) if err != nil { return "", nil } - r, err := models.GetResourceByToken(c.Context(), db, token) + r, err := resolveResourceByTokenOrID(c.Context(), db, pathUUID) if err != nil { // Including ErrResourceNotFound — fail open so the handler returns // its own 404 (which contains a stable, agent-readable shape). diff --git a/internal/handlers/env_policy_test.go b/internal/handlers/env_policy_test.go index c789ecc8..0f130e20 100644 --- a/internal/handlers/env_policy_test.go +++ b/internal/handlers/env_policy_test.go @@ -73,7 +73,7 @@ func envPolicyApp(t *testing.T, db *sql.DB) *fiber.App { api.Delete("/resources/:id", middleware.RequireEnvAccess(middleware.EnvPolicyActionDeleteResource, middleware.WithEnvLookup(func(c *fiber.Ctx) (string, error) { - return handlers.ResourceEnvByTokenForMiddleware(c, db) + return handlers.ResourceEnvByTokenOrIDForMiddleware(c, db) }), ), func(c *fiber.Ctx) error { return c.JSON(fiber.Map{"ok": true}) }, diff --git a/internal/handlers/finalize_provision_test.go b/internal/handlers/finalize_provision_test.go index 7041bd61..28b95754 100644 --- a/internal/handlers/finalize_provision_test.go +++ b/internal/handlers/finalize_provision_test.go @@ -77,16 +77,18 @@ func TestFinalizeProvision_PersistenceFailure_ReturnsErrorAndRunsCleanup(t *test "finalizeProvision must run the cleanup closure on persistence failure to tear down "+ "the just-provisioned backend object; otherwise the platform leaks an orphan") - // 3. Row is soft-deleted (status='deleted'), NOT left at 'pending' or - // 'active'. A pending row would be picked up by the reconciler; an - // active row would falsely advertise itself as usable in dashboard - // listings and quota counts. + // 3. Row is marked failed (status='failed'), NOT left at 'pending' or + // 'active', and NOT soft-deleted. A pending row would be picked up by + // the reconciler; an active row would falsely advertise itself as + // usable in dashboard listings and quota counts; a deleted row would + // VANISH from the caller's read surface (the pre-Wave-2-A1 behaviour) + // leaving no pollable terminal state. var status string require.NoError(t, dbConn.QueryRow( `SELECT status FROM resources WHERE id = $1`, res.ID, ).Scan(&status)) - assert.Equal(t, "deleted", status, - "on a persistence failure the row must be soft-deleted so it doesn't leak as an orphan") + assert.Equal(t, models.StatusFailed, status, + "on a persistence failure the row must be marked 'failed' — a pollable terminal state") } // TestFinalizeProvision_Success_FlipsToActive is the happy-path guard: diff --git a/internal/handlers/nosql.go b/internal/handlers/nosql.go index fb4bccd6..857ad4bd 100644 --- a/internal/handlers/nosql.go +++ b/internal/handlers/nosql.go @@ -217,8 +217,9 @@ func (h *NoSQLHandler) NewNoSQL(c *fiber.Ctx) error { middleware.RecordProvisionFail("mongodb", middleware.ProvisionFailBackendUnavailable) slog.Error("nosql.new.provision_failed", "error", err, "token", tokenStr, "request_id", requestID) - // Soft-delete the resource record so limits aren't falsely consumed. - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + // Mark the pending row 'failed' — a pollable terminal state. Failed + // rows never count against quota (counts filter status='active'). + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("nosql.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision MongoDB database") @@ -379,7 +380,7 @@ func (h *NoSQLHandler) newNoSQLAuthenticated( middleware.RecordProvisionFail("mongodb", middleware.ProvisionFailBackendUnavailable) slog.Error("nosql.new.provision_failed_auth", "error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("nosql.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision MongoDB database") @@ -567,7 +568,7 @@ func (h *NoSQLHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionFor middleware.RecordProvisionFail(models.ResourceTypeMongoDB, middleware.ProvisionFailBackendUnavailable) slog.Error("twin.nosql.provision_failed", "error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("twin.nosql.soft_delete_failed", "error", delErr, "resource_id", resource.ID, "request_id", in.RequestID) } diff --git a/internal/handlers/openapi.go b/internal/handlers/openapi.go index 6e1fdd95..c182b729 100644 --- a/internal/handlers/openapi.go +++ b/internal/handlers/openapi.go @@ -882,10 +882,11 @@ const openAPISpec = `{ "delete": { "summary": "Delete a resource", "security": [{ "bearerAuth": [] }], - "parameters": [{ "name": "id", "in": "path", "required": true, "schema": { "type": "string", "format": "uuid" } }], + "parameters": [{ "name": "id", "in": "path", "required": true, "schema": { "type": "string", "format": "uuid" }, "description": "Accepts EITHER the resource's 'id' (as returned by GET /api/v1/resources) OR its provision 'token'. Resolution tries token first, then id; authorization (team ownership) is identical for both forms." }], "responses": { "200": { "description": "Resource deleted" }, - "403": { "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }." } + "403": { "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }." }, + "404": { "description": "Not found — no resource with that id or token, or it belongs to another team." } } } }, diff --git a/internal/handlers/provision_helper.go b/internal/handlers/provision_helper.go index acc8bc07..1697f29a 100644 --- a/internal/handlers/provision_helper.go +++ b/internal/handlers/provision_helper.go @@ -430,7 +430,9 @@ var errProvisionPersistFailed = errors.New("provision persistence failed") // this helper: // // - runs the caller-supplied cleanup closure (best-effort backend deprovision), -// - soft-deletes the resource row, +// - marks the resource row 'failed' (a pollable terminal state — see +// models.MarkResourceFailed; failed rows are list-visible, deletable, +// and quota-exempt), // - returns errProvisionPersistFailed. // // The caller treats a non-nil return as a hard provision failure @@ -505,12 +507,14 @@ func (h *provisionHelper) finalizeProvision( } // Persistence failed — the resource is unreachable / un-addressable. Tear - // down the backend object (best-effort) and soft-delete the row so the - // platform is not left billing an orphan, then signal a hard failure. + // down the backend object (best-effort) and mark the row 'failed' so the + // caller has a pollable terminal state and the platform is not left + // billing an orphan (quota counts filter status='active'), then signal a + // hard failure. if cleanup != nil { cleanup() } - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error(logPrefix+".cleanup_soft_delete_failed", "error", delErr, "resource_id", resource.ID, "request_id", requestID) } @@ -518,7 +522,7 @@ func (h *provisionHelper) finalizeProvision( // MR-P0-3: emit the operator-alert audit row. This is the moment the // platform produced an unreachable resource — the backend object existed // (briefly), the platform DB could not address it, and the deprovision + - // soft-delete is the compensation. Operators key on this kind to + // 'failed' mark is the compensation. Operators key on this kind to // reconstruct the exact request, find the upstream failure cause // (DB unreachable / encrypt failure / etc.), and audit-trace any backend // objects that escaped the best-effort cleanup. Best-effort emit: audit- diff --git a/internal/handlers/provision_softdelete_final2_test.go b/internal/handlers/provision_softdelete_final2_test.go index 79f6285f..cc5b27b3 100644 --- a/internal/handlers/provision_softdelete_final2_test.go +++ b/internal/handlers/provision_softdelete_final2_test.go @@ -19,6 +19,7 @@ package handlers_test // caps). A unique IP per call keeps the rate-limit/fingerprint counters clean. import ( + "context" "database/sql" "encoding/json" "errors" @@ -37,6 +38,7 @@ import ( "instant.dev/internal/config" "instant.dev/internal/handlers" "instant.dev/internal/middleware" + "instant.dev/internal/models" "instant.dev/internal/plans" "instant.dev/internal/testhelpers" ) @@ -114,7 +116,9 @@ func postBadBackend(t *testing.T, app *fiber.App, path, jwt, ip string) (int, st } // TestProvisionFinal2_BackendFailure_SoftDelete_Auth covers the authenticated -// backend-failure soft-delete arms in db.go (L424) and nosql.go. +// backend-failure rollback arms in db.go (L424) and nosql.go. Post Wave-2 A1 +// the rollback persists the row as status='failed' (a pollable terminal +// state) instead of soft-deleting it — asserted per resource type below. func TestProvisionFinal2_BackendFailure_SoftDelete_Auth(t *testing.T) { if os.Getenv("TEST_DATABASE_URL") == "" { t.Skip("TEST_DATABASE_URL not set") @@ -123,10 +127,25 @@ func TestProvisionFinal2_BackendFailure_SoftDelete_Auth(t *testing.T) { teamID := testhelpers.MustCreateTeamDB(t, liveDB, "pro") jwt := authSessionJWT(t, liveDB, teamID) - for i, path := range []string{"/db/new", "/nosql/new", "/vector/new", "/queue/new"} { - status, errCode := postBadBackend(t, app, path, jwt, "10.220."+digitStr(i)+".7") - assert.Equalf(t, http.StatusServiceUnavailable, status, "%s backend-fail must 503", path) - assert.Equalf(t, "provision_failed", errCode, "%s error code", path) + cases := []struct{ path, resourceType string }{ + {"/db/new", "postgres"}, + {"/nosql/new", "mongodb"}, + {"/vector/new", "vector"}, + {"/queue/new", "queue"}, + } + for i, tc := range cases { + status, errCode := postBadBackend(t, app, tc.path, jwt, "10.220."+digitStr(i)+".7") + assert.Equalf(t, http.StatusServiceUnavailable, status, "%s backend-fail must 503", tc.path) + assert.Equalf(t, "provision_failed", errCode, "%s error code", tc.path) + + // Wave-2 A1: the rolled-back row must persist as 'failed', not vanish. + var rowStatus string + require.NoErrorf(t, liveDB.QueryRowContext(context.Background(), + `SELECT status FROM resources WHERE team_id = $1::uuid AND resource_type = $2 + ORDER BY created_at DESC LIMIT 1`, teamID, tc.resourceType).Scan(&rowStatus), + "%s rollback row must exist", tc.path) + assert.Equalf(t, models.StatusFailed, rowStatus, + "%s rollback row must be status='failed' (pollable terminal state)", tc.path) } } diff --git a/internal/handlers/queue.go b/internal/handlers/queue.go index af4cd126..ca443f12 100644 --- a/internal/handlers/queue.go +++ b/internal/handlers/queue.go @@ -297,8 +297,9 @@ func (h *QueueHandler) NewQueue(c *fiber.Ctx) error { middleware.RecordProvisionFail("queue", middleware.ProvisionFailBackendUnavailable) slog.Error("queue.new.provision_failed", "error", err, "token", tokenStr, "request_id", requestID) - // Soft-delete the resource record so limits aren't falsely consumed. - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + // Mark the pending row 'failed' — a pollable terminal state. Failed + // rows never count against quota (counts filter status='active'). + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("queue.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision NATS credentials") @@ -523,7 +524,7 @@ func (h *QueueHandler) newQueueAuthenticated( middleware.RecordProvisionFail("queue", middleware.ProvisionFailBackendUnavailable) slog.Error("queue.new.provision_failed_auth", "error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("queue.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision NATS credentials") diff --git a/internal/handlers/resource.go b/internal/handlers/resource.go index 88c58c1e..9cf9326b 100644 --- a/internal/handlers/resource.go +++ b/internal/handlers/resource.go @@ -181,7 +181,29 @@ func (h *ResourceHandler) Get(c *fiber.Ctx) error { }) } +// resolveResourceByTokenOrID resolves a path UUID first as a provision token +// (the historical DELETE contract), then — when no token matches — as the +// row's primary-key id. GET /api/v1/resources keys its items by `id`, so +// before this fallback the natural list→delete-by-id flow 404'd 100% of the +// time (Wave-2 A1). Token resolution stays first so existing token-addressed +// callers see byte-identical behaviour; a collision between one row's token +// and another row's id is a 2^-122 non-event. +func resolveResourceByTokenOrID(ctx context.Context, db *sql.DB, pathUUID uuid.UUID) (*models.Resource, error) { + resource, err := models.GetResourceByToken(ctx, db, pathUUID) + if err != nil { + var notFound *models.ErrResourceNotFound + if errors.As(err, ¬Found) { + return models.GetResourceByID(ctx, db, pathUUID) + } + } + return resource, err +} + // Delete handles DELETE /api/v1/resources/:id — soft-deletes a resource. +// :id accepts EITHER the resource's provision token (historical contract) +// OR its `id` as returned by GET /api/v1/resources (see +// resolveResourceByTokenOrID). Authorization is identical for both forms: +// the row must belong to the caller's team. func (h *ResourceHandler) Delete(c *fiber.Ctx) error { requestID := middleware.GetRequestID(c) @@ -191,12 +213,12 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error { } tokenStr := c.Params("id") - token, parseErr := uuid.Parse(tokenStr) + pathUUID, parseErr := uuid.Parse(tokenStr) if parseErr != nil { return respondError(c, fiber.StatusBadRequest, "invalid_id", "Resource ID must be a valid UUID") } - resource, err := models.GetResourceByToken(c.Context(), h.db, token) + resource, err := resolveResourceByTokenOrID(c.Context(), h.db, pathUUID) if err != nil { var notFound *models.ErrResourceNotFound if errors.As(err, ¬Found) { @@ -210,6 +232,11 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error { return respondError(c, fiber.StatusServiceUnavailable, "fetch_failed", "Failed to fetch resource") } + // token is the resource's TOKEN, not the raw path UUID — under the + // id-addressed form the path param is the row id, and every backend + // object name / cache key / audit summary derives from the token. + token := resource.Token + if !resource.TeamID.Valid || resource.TeamID.UUID != teamID { // 404 not 403: never confirm the existence of resources owned by // other teams (or unclaimed anonymous resources). @@ -298,8 +325,9 @@ func (h *ResourceHandler) Delete(c *fiber.Ctx) error { // Invalidate the cached resource entry so any in-flight requests // see the deletion immediately rather than waiting for TTL expiry. - // Use token.String() (normalized lowercase UUID) to match the key written by - // getResourceCached — tokenStr is the raw URL param and may be mixed-case. + // Use token.String() (normalized lowercase TOKEN UUID) to match the key + // written by getResourceCached — tokenStr is the raw URL param, may be + // mixed-case, and under the id-addressed form isn't the token at all. h.rdb.Del(c.Context(), fmt.Sprintf("res:%s", token.String())) slog.Info("resource.deleted", diff --git a/internal/handlers/resource_delete_by_id_test.go b/internal/handlers/resource_delete_by_id_test.go new file mode 100644 index 00000000..7dd9666e --- /dev/null +++ b/internal/handlers/resource_delete_by_id_test.go @@ -0,0 +1,271 @@ +package handlers_test + +// resource_delete_by_id_test.go — Wave-2 A1 behavioural coverage: +// +// 1. DELETE /api/v1/resources/:id accepts BOTH address forms — the row `id` +// (as returned by GET /api/v1/resources) and the provision `token`. The +// id form used to 404 100% of the time, breaking the natural +// list→delete-by-id flow. +// 2. Provision-rollback rows persist as status='failed' (a pollable terminal +// state): visible in lists, deletable, never counted against quota. +// +// Uses the bufconn fake-provisioner fixture from +// coverage_provisioner_grpc_test.go so the deprovision RPC is observable. + +import ( + "context" + "encoding/json" + "io" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/google/uuid" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "instant.dev/internal/handlers" + "instant.dev/internal/models" + "instant.dev/internal/plans" + "instant.dev/internal/testhelpers" +) + +// decodeInto unmarshals a response body into out (best-effort: an empty body +// leaves out zero-valued). +func decodeInto(t *testing.T, resp *http.Response, out any) { + t.Helper() + raw, err := io.ReadAll(resp.Body) + require.NoError(t, err) + if len(raw) > 0 { + require.NoError(t, json.Unmarshal(raw, out)) + } +} + +// mustParseUUID parses s or fails the test. +func mustParseUUID(t *testing.T, s string) uuid.UUID { + t.Helper() + id, err := uuid.Parse(s) + require.NoError(t, err) + return id +} + +// doDelete issues DELETE /api/v1/resources/{ref} with the given JWT. +func doDelete(t *testing.T, fx grpcProvFixture, ref, jwt string) (*http.Response, grpcProvResp) { + t.Helper() + req := httptest.NewRequest(http.MethodDelete, "/api/v1/resources/"+ref, nil) + req.Header.Set("X-Forwarded-For", "10.70.0.1") + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := fx.app.Test(req, 15000) + require.NoError(t, err) + var parsed grpcProvResp + decodeInto(t, resp, &parsed) + return resp, parsed +} + +// resourceStatus reads the row's current status. +func resourceStatus(t *testing.T, fx grpcProvFixture, id string) string { + t.Helper() + var status string + require.NoError(t, fx.db.QueryRowContext(context.Background(), + `SELECT status FROM resources WHERE id = $1::uuid`, id).Scan(&status)) + return status +} + +// TestResourceDelete_ByRowID_Succeeds is the headline fix: the id returned by +// the list endpoint is accepted by DELETE, with the deprovision RPC still +// addressed by the resource's TOKEN (backend object names derive from it). +func TestResourceDelete_ByRowID_Succeeds(t *testing.T) { + fake := &fakeProvisioner{} + fx := setupGRPCProvFixture(t, fake, false) + teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro") + jwt := authSessionJWT(t, fx.db, teamID) + id, token := seedSourceResource(t, fx.db, teamID, "postgres", "pro", "production") + + resp, _ := doDelete(t, fx, id, jwt) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, + "DELETE by row id must succeed — the list endpoint keys rows by id") + + assert.Equal(t, "deleted", resourceStatus(t, fx, id)) + + // The destructive deprovision must target the TOKEN, not the path id. + require.GreaterOrEqual(t, fake.deprovisionCount(), 1, "deprovision RPC must fire") + require.NotNil(t, fake.lastDeprovision()) + assert.Equal(t, token, fake.lastDeprovision().GetToken(), + "deprovision must be addressed by resource.Token even when the path param is the row id") +} + +// TestResourceDelete_ByToken_StillWorks pins the historical contract. +func TestResourceDelete_ByToken_StillWorks(t *testing.T) { + fake := &fakeProvisioner{} + fx := setupGRPCProvFixture(t, fake, false) + teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro") + jwt := authSessionJWT(t, fx.db, teamID) + id, token := seedSourceResource(t, fx.db, teamID, "postgres", "pro", "production") + + resp, _ := doDelete(t, fx, token, jwt) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode) + assert.Equal(t, "deleted", resourceStatus(t, fx, id)) +} + +// TestResourceDelete_ByRowID_OtherTeam_404 — authorization is identical for +// the id form: another team's row id resolves but must 404 (never confirm +// existence). +func TestResourceDelete_ByRowID_OtherTeam_404(t *testing.T) { + fake := &fakeProvisioner{} + fx := setupGRPCProvFixture(t, fake, false) + ownerTeam := testhelpers.MustCreateTeamDB(t, fx.db, "pro") + id, _ := seedSourceResource(t, fx.db, ownerTeam, "postgres", "pro", "production") + + attackerTeam := testhelpers.MustCreateTeamDB(t, fx.db, "pro") + jwt := authSessionJWT(t, fx.db, attackerTeam) + + resp, body := doDelete(t, fx, id, jwt) + defer resp.Body.Close() + require.Equal(t, http.StatusNotFound, resp.StatusCode) + assert.Equal(t, "not_found", body.Error) + assert.Equal(t, "active", resourceStatus(t, fx, id), "row must be untouched") + assert.Equal(t, 0, fake.deprovisionCount(), "no deprovision for an unauthorized delete") +} + +// TestResourceDelete_UnknownUUID_404 — a UUID matching neither token nor id. +func TestResourceDelete_UnknownUUID_404(t *testing.T) { + fake := &fakeProvisioner{} + fx := setupGRPCProvFixture(t, fake, false) + teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro") + jwt := authSessionJWT(t, fx.db, teamID) + + resp, body := doDelete(t, fx, "00000000-0000-4000-8000-00000000dead", jwt) + defer resp.Body.Close() + require.Equal(t, http.StatusNotFound, resp.StatusCode) + assert.Equal(t, "not_found", body.Error) +} + +// TestProvisionRollback_PersistsFailedRow_ListedAndDeletable drives the full +// Wave-2 A1 lifecycle: a failed provision leaves a status='failed' row that +// (a) the caller can poll, (b) appears in GET /api/v1/resources, (c) does NOT +// count against the per-type quota, and (d) is deletable by row id. +func TestProvisionRollback_PersistsFailedRow_ListedAndDeletable(t *testing.T) { + fake := &fakeProvisioner{failProvision: true} + fx := setupGRPCProvFixture(t, fake, false) + teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro") + jwt := authSessionJWT(t, fx.db, teamID) + + resp, body := doProvision(t, fx, "/db/new", "10.71.0.1", jwt, map[string]any{"name": "rollback-failed-row"}) + resp.Body.Close() + require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) + require.Equal(t, "provision_failed", body.Error) + + // (a) The row persists in the terminal 'failed' state — it must NOT have + // vanished (the pre-A1 soft-delete made a timed-out provision unpollable). + var id, status string + require.NoError(t, fx.db.QueryRowContext(context.Background(), + `SELECT id::text, status FROM resources WHERE team_id = $1::uuid AND resource_type = 'postgres' + ORDER BY created_at DESC LIMIT 1`, teamID).Scan(&id, &status)) + require.Equal(t, models.StatusFailed, status, + "provision rollback must persist a 'failed' row, not delete it") + + // (b) Visible in the team list (only 'deleted' rows are hidden). + listReq := httptest.NewRequest(http.MethodGet, "/api/v1/resources", nil) + listReq.Header.Set("Authorization", "Bearer "+jwt) + listResp, err := fx.app.Test(listReq, 15000) + require.NoError(t, err) + defer listResp.Body.Close() + require.Equal(t, http.StatusOK, listResp.StatusCode) + var list struct { + Items []struct { + ID string `json:"id"` + Status string `json:"status"` + } `json:"items"` + } + decodeInto(t, listResp, &list) + found := false + for _, it := range list.Items { + if it.ID == id { + found = true + assert.Equal(t, models.StatusFailed, it.Status) + } + } + assert.True(t, found, "failed row must appear in GET /api/v1/resources") + + // (c) Quota-exempt: failed rows never count toward the per-type cap. + n, err := models.CountActiveResourcesByTeamAndType(context.Background(), fx.db, + mustParseUUID(t, teamID), "postgres") + require.NoError(t, err) + assert.Equal(t, 0, n, "a failed row must not count against the plan quota") + + // (d) Deletable by row id — terminal cleanup stays in the user's hands. + delResp, _ := doDelete(t, fx, id, jwt) + defer delResp.Body.Close() + require.Equal(t, http.StatusOK, delResp.StatusCode, "failed rows must be deletable") + assert.Equal(t, "deleted", resourceStatus(t, fx, id)) +} + +// TestProvisionRollback_Cache_Authenticated_FailedRow covers the +// authenticated /cache/new rollback arm (cache.go) — the one provision-fail +// arm the pre-A1 suites didn't reach — and asserts the new terminal state. +func TestProvisionRollback_Cache_Authenticated_FailedRow(t *testing.T) { + fake := &fakeProvisioner{failProvision: true} + fx := setupGRPCProvFixture(t, fake, false) + teamID := testhelpers.MustCreateTeamDB(t, fx.db, "pro") + jwt := authSessionJWT(t, fx.db, teamID) + + resp, body := doProvision(t, fx, "/cache/new", "10.72.0.1", jwt, map[string]any{"name": "cache-auth-fail"}) + defer resp.Body.Close() + require.Equal(t, http.StatusServiceUnavailable, resp.StatusCode) + require.Equal(t, "provision_failed", body.Error) + + var status string + require.NoError(t, fx.db.QueryRowContext(context.Background(), + `SELECT status FROM resources WHERE team_id = $1::uuid AND resource_type = 'redis' + ORDER BY created_at DESC LIMIT 1`, teamID).Scan(&status)) + assert.Equal(t, models.StatusFailed, status) +} + +// TestResourceDelete_Storage_ByRowID_AuditUsesToken drives the storage +// deprovision arm with a SUCCESSFUL revoke (hermetic minio-admin fake) under +// the id-addressed form, proving the IAM-removal audit row is keyed by the +// resource TOKEN (key_), not the path id. +func TestResourceDelete_Storage_ByRowID_AuditUsesToken(t *testing.T) { + db, clean := testhelpers.SetupTestDB(t) + defer clean() + rdb, rClean := testhelpers.SetupTestRedis(t) + defer rClean() + teamID := testhelpers.MustCreateTeamDB(t, db, "pro") + userID := uuid.NewString() + + h := handlers.NewResourceHandler(db, rdb, resourceResidualConfig(), plans.Default(), nil, prefixScopedProvider(t)) + app := resourceResidualApp(t, db, rdb, h, teamID, userID) + + token := seedTeamResource(t, db, teamID, "storage", "active") + var rowID string + require.NoError(t, db.QueryRowContext(context.Background(), + `SELECT id::text FROM resources WHERE token = $1`, token).Scan(&rowID)) + + req := httptest.NewRequest(http.MethodDelete, "/api/v1/resources/"+rowID, nil) + resp, err := app.Test(req, 10000) + require.NoError(t, err) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "storage delete by row id must succeed") + + // The IAM audit emit is async (safego.Go) — poll briefly. + wantFragment := "key_" + token[:8] + deadline := time.Now().Add(3 * time.Second) + for { + var summary string + err := db.QueryRowContext(context.Background(), + `SELECT summary FROM audit_log WHERE team_id = $1::uuid AND kind = $2 + ORDER BY created_at DESC LIMIT 1`, teamID, models.AuditKindStorageIAMUserDeleted).Scan(&summary) + if err == nil { + assert.Contains(t, summary, wantFragment, + "IAM audit summary must reference the TOKEN-derived key, not the path id") + return + } + if time.Now().After(deadline) { + t.Fatalf("storage IAM audit row never appeared: %v", err) + } + time.Sleep(50 * time.Millisecond) + } +} diff --git a/internal/handlers/resource_env_mw_final3_test.go b/internal/handlers/resource_env_mw_final3_test.go index 6adefefc..1c3b699f 100644 --- a/internal/handlers/resource_env_mw_final3_test.go +++ b/internal/handlers/resource_env_mw_final3_test.go @@ -1,7 +1,7 @@ package handlers_test // resource_env_mw_final3_test.go — FINAL serial pass #3. Covers all three arms -// of ResourceEnvByTokenForMiddleware (env_policy_helpers.go): +// of ResourceEnvByTokenOrIDForMiddleware (env_policy_helpers.go): // - non-UUID :id → ("", nil) (line 32) // - token not found → ("", nil) fail-open (line 38) // - real resource → (env, nil) happy (line 40) @@ -27,15 +27,15 @@ func TestResourceEnvByTokenMWFinal3_Arms(t *testing.T) { teamID := testhelpers.MustCreateTeamDB(t, db, "pro") // Seed a real resource so the happy arm returns its env. - var token string + var rowID, token string require.NoError(t, db.QueryRowContext(context.Background(), `INSERT INTO resources (team_id, resource_type, tier, env, status) VALUES ($1::uuid, 'postgres', 'pro', 'staging', 'active') - RETURNING token::text`, teamID).Scan(&token)) + RETURNING id::text, token::text`, teamID).Scan(&rowID, &token)) app := fiber.New() app.Get("/r/:id", func(c *fiber.Ctx) error { - env, err := handlers.ResourceEnvByTokenForMiddleware(c, db) + env, err := handlers.ResourceEnvByTokenOrIDForMiddleware(c, db) if err != nil { return c.Status(http.StatusInternalServerError).SendString("err") } @@ -59,7 +59,12 @@ func TestResourceEnvByTokenMWFinal3_Arms(t *testing.T) { _, body = get(uuid.NewString()) assert.Empty(t, body) - // real resource → its env (line 40). + // real resource by token → its env (line 40). _, body = get(token) assert.Equal(t, "staging", body) + + // real resource by ROW ID → its env (Wave-2 A1: the id-addressed DELETE + // form must hit the same env-policy gate as the token form). + _, body = get(rowID) + assert.Equal(t, "staging", body) } diff --git a/internal/handlers/resourcestatus_conversion_test.go b/internal/handlers/resourcestatus_conversion_test.go index 2289e656..11f41b81 100644 --- a/internal/handlers/resourcestatus_conversion_test.go +++ b/internal/handlers/resourcestatus_conversion_test.go @@ -29,7 +29,7 @@ import ( // allRawStatuses is the set of status strings the resources table can // hold plus a junk value, so the equivalence check exercises the // unrecognised-value path too. -var allRawStatuses = []string{"active", "paused", "suspended", "expired", "deleted", "garbage"} +var allRawStatuses = []string{"active", "paused", "suspended", "failed", "expired", "deleted", "garbage"} // TestPauseStatusPredicate_EquivalentToOldLiterals covers resource.go // Pause: old code rejected with "already_paused" on Status == "paused" diff --git a/internal/handlers/storage.go b/internal/handlers/storage.go index 7b07578d..c8c3ba81 100644 --- a/internal/handlers/storage.go +++ b/internal/handlers/storage.go @@ -328,7 +328,7 @@ func (h *StorageHandler) NewStorage(c *fiber.Ctx) error { middleware.RecordProvisionFail("storage", middleware.ProvisionFailBackendUnavailable) slog.Error("storage.new.provision_failed", "error", err, "token", tokenStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("storage.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID) } return respondError(c, fiber.StatusServiceUnavailable, "provision_failed", "Failed to provision storage credentials") @@ -528,7 +528,7 @@ func (h *StorageHandler) newStorageAuthenticated( middleware.RecordProvisionFail("storage", middleware.ProvisionFailBackendUnavailable) slog.Error("storage.new.provision_failed_auth", "error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("storage.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID) } return respondError(c, fiber.StatusServiceUnavailable, "provision_failed", "Failed to provision storage credentials") diff --git a/internal/handlers/vector.go b/internal/handlers/vector.go index 71365929..b67da705 100644 --- a/internal/handlers/vector.go +++ b/internal/handlers/vector.go @@ -360,7 +360,7 @@ func (h *VectorHandler) NewVector(c *fiber.Ctx) error { metrics.ProvisionFailures.WithLabelValues(models.ResourceTypeVector, "grpc_error").Inc() slog.Error("vector.new.provision_failed", "error", err, "token", tokenStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("vector.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision vector database") @@ -520,7 +520,7 @@ func (h *VectorHandler) newVectorAuthenticated( metrics.ProvisionFailures.WithLabelValues(models.ResourceTypeVector, "grpc_error").Inc() slog.Error("vector.new.provision_failed_auth", "error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID) - if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil { + if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil { slog.Error("vector.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID) } return respondProvisionFailed(c, err, "Failed to provision vector database") diff --git a/internal/models/coverage_resource_test.go b/internal/models/coverage_resource_test.go index f53c96c5..c1b1f98c 100644 --- a/internal/models/coverage_resource_test.go +++ b/internal/models/coverage_resource_test.go @@ -73,6 +73,22 @@ func TestMarkResourceActive_Branches(t *testing.T) { require.ErrorContains(t, MarkResourceActive(ctx, db3, uuid.New()), "boom") } +func TestMarkResourceFailed_Branches(t *testing.T) { + ctx := context.Background() + db, mock := newMock(t) + mock.ExpectExec(`UPDATE resources SET status = 'failed'`).WillReturnResult(sqlmock.NewResult(0, 1)) + require.NoError(t, MarkResourceFailed(ctx, db, uuid.New())) + + // Zero rows — the row was already moved out of 'pending' by another path. + db2, mock2 := newMock(t) + mock2.ExpectExec(`UPDATE resources SET status = 'failed'`).WillReturnResult(sqlmock.NewResult(0, 0)) + require.ErrorIs(t, MarkResourceFailed(ctx, db2, uuid.New()), ErrResourceNotPending) + + db3, mock3 := newMock(t) + mock3.ExpectExec(`UPDATE resources SET status = 'failed'`).WillReturnError(errors.New("boom")) + require.ErrorContains(t, MarkResourceFailed(ctx, db3, uuid.New()), "boom") +} + func TestCountActiveResourcesByTeamAndType_Branches(t *testing.T) { ctx := context.Background() db, mock := newMock(t) @@ -223,17 +239,6 @@ func TestSetWebhookHMACSecret_Branches(t *testing.T) { require.ErrorContains(t, SetWebhookHMACSecret(ctx, db3, uuid.New(), "x"), "boom") } -func TestSoftDeleteResource_Branches(t *testing.T) { - ctx := context.Background() - db, mock := newMock(t) - mock.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnResult(sqlmock.NewResult(0, 1)) - require.NoError(t, SoftDeleteResource(ctx, db, uuid.New())) - - db2, mock2 := newMock(t) - mock2.ExpectExec(`UPDATE resources SET status = 'deleted'`).WillReturnError(errors.New("boom")) - require.ErrorContains(t, SoftDeleteResource(ctx, db2, uuid.New()), "boom") -} - func TestSoftDeleteResourceIfActive_Branches(t *testing.T) { ctx := context.Background() diff --git a/internal/models/resource.go b/internal/models/resource.go index ff03f99a..0dd8ed23 100644 --- a/internal/models/resource.go +++ b/internal/models/resource.go @@ -197,6 +197,17 @@ const StatusPending = "pending" // StatusActive is the canonical "provisioned and usable" status. const StatusActive = "active" +// StatusFailed is the terminal status a resource row carries when the backend +// provision RPC failed or timed out and the handler rolled the provision back. +// Before Wave-2 A1 (2026-06-11) the rollback soft-deleted the row, so a +// timed-out provision VANISHED from every read surface and the caller had no +// terminal state to poll. A failed row is visible in lists (only 'deleted' is +// hidden), deletable via DELETE /api/v1/resources/:id, exempt from quota +// counts (those filter status='active'), and carries no live backing infra — +// the backend object either never existed or was torn down by the rollback's +// best-effort cleanup. Added by migration 070_resources_failed_status.sql. +const StatusFailed = "failed" + // CreateResource inserts a new resource row and returns it. // // MR-P0-2 (BugBash 2026-05-20): the row is inserted with status='pending', NOT @@ -271,6 +282,31 @@ func MarkResourceActive(ctx context.Context, db *sql.DB, id uuid.UUID) error { // that is not in the expected mid-provision state. var ErrResourceNotPending = fmt.Errorf("models: resource is not pending") +// MarkResourceFailed flips a resource from 'pending' → 'failed'. It is the +// provision-rollback counterpart of MarkResourceActive: the handler runs it +// when the backend provision RPC fails/times out, or when a post-RPC +// persistence step fails (finalizeProvision), so the caller is left with a +// pollable terminal row instead of a vanished one. +// +// The atomic `WHERE status = 'pending'` guard mirrors MarkResourceActive: a +// row some other path already moved out of 'pending' (a reconciler abandon, a +// concurrent delete) is NOT clobbered. Returns ErrResourceNotPending on a +// zero-row update so the caller can log the unexpected state; rollback callers +// treat that as best-effort (the 503 to the customer is unchanged either way). +func MarkResourceFailed(ctx context.Context, db *sql.DB, id uuid.UUID) error { + res, err := db.ExecContext(ctx, ` + UPDATE resources SET status = 'failed' WHERE id = $1 AND status = 'pending' + `, id) + if err != nil { + return fmt.Errorf("models.MarkResourceFailed: %w", err) + } + n, _ := res.RowsAffected() + if n == 0 { + return ErrResourceNotPending + } + return nil +} + // CountActiveResourcesByTeamAndType returns the number of active (non-deleted) // resources of the given type owned by a team. Used for plan limit enforcement. // Counts across ALL environments — plan limits apply per team, not per env. @@ -454,19 +490,6 @@ func SetWebhookHMACSecret(ctx context.Context, db *sql.DB, resourceID uuid.UUID, return nil } -// SoftDeleteResource marks a resource status as 'deleted'. Used by the -// provision-rollback paths (db/cache/nosql/queue/storage/vector), which always -// operate on a row they just created — they don't need the concurrency guard. -func SoftDeleteResource(ctx context.Context, db *sql.DB, id uuid.UUID) error { - _, err := db.ExecContext(ctx, ` - UPDATE resources SET status = 'deleted' WHERE id = $1 - `, id) - if err != nil { - return fmt.Errorf("models.SoftDeleteResource: %w", err) - } - return nil -} - // SoftDeleteResourceIfActive flips status to 'deleted' ONLY when the row is not // already deleted, returning whether THIS call performed the transition. // diff --git a/internal/provisioner/client.go b/internal/provisioner/client.go index f0cb5be5..ed3f6fb7 100644 --- a/internal/provisioner/client.go +++ b/internal/provisioner/client.go @@ -33,27 +33,40 @@ const ( provisionerCircuitCooldown = 30 * time.Second ) +// dedicatedProvisionReadyWait mirrors the provisioner's k8s-backend pod-ready +// timeouts (redisK8sReadyTO / k8sReadyTimeout / natsK8sReadyTO / +// mongoK8sReadyTO in provisioner/internal/backend/* — all 3 minutes): the +// documented budget a dedicated-pod provision may legitimately spend waiting +// for PVC bind + image pull + readiness. Every provision RPC deadline on this +// client MUST be ≥ this value plus provisionRPCDeadlineMargin — an RPC +// deadline that undercuts the backend's own ready-wait kills a provision that +// was still on track to succeed (and, post Wave-2 A1, strands a 'failed' row +// for a resource the provisioner then finishes bringing up). +// TestProvisionTimeouts_NeverUndercutDedicatedReadyWait enforces the +// invariant for every tier and every provision deadline in this file. +const dedicatedProvisionReadyWait = 3 * time.Minute + +// provisionRPCDeadlineMargin is the headroom an RPC deadline carries on top +// of dedicatedProvisionReadyWait, covering the non-pod-wait parts of the +// provision (credential mint, ACL setup, network round-trips). +const provisionRPCDeadlineMargin = 30 * time.Second + // cacheProvisionTimeout bounds the api → provisioner ProvisionCache RPC. // -// Unlike Postgres/Mongo, a Redis namespace provision does NOT spin up a -// per-tenant pod with a PVC bind + image pull + DB init — it carves a -// namespace/ACL out of an already-running Redis (observed ~1–6s end to -// end, anonymous AND authenticated). It therefore does not need the -// 4–5m cold-pod budget that provisionTimeout() grants the pod-backed -// resource types. -// -// Granting Redis that same multi-minute budget means a *hung* provisioner -// hangs the whole /cache/new request for up to 5 minutes (the symptom that -// motivated this fix: authed /cache/new observed hanging >60s while anon -// returned in ~6s). A generous-but-bounded 45s ceiling is ~7× the slowest -// observed healthy provision, so a legitimately slow-but-OK Redis carve -// still succeeds, while a genuine hang fails fast — the existing handler -// error path then soft-deletes the pending resource and returns a clean -// 503 provision_failed instead of an indefinite hang. +// History: this was a 45s "fail fast" ceiling from when a Redis provision was +// believed to always be a shared-instance namespace/ACL carve (~1–6s). In +// production REDIS_PROVISION_BACKEND=k8s provisions a dedicated Redis pod +// (and under tier-aware routing the Team tier always does), which the +// provisioner legitimately waits up to dedicatedProvisionReadyWait (3m) for — +// so the 45s RPC deadline undercut the backend's own ready-wait and killed +// cold-pod provisions mid-flight. Reconciled (Wave-2 A1): the deadline is now +// derived from the dedicated ready-wait plus margin, never below it. Hung- +// provisioner protection comes from this still-bounded deadline plus the +// shared circuit breaker (5 consecutive failures → ErrOpen in <1ms). // // A package var (not const) so the timeout→503 unit test can shrink it to -// a few ms instead of blocking the suite for 45s; production never mutates it. -var cacheProvisionTimeout = 45 * time.Second +// a few ms instead of blocking the suite; production never mutates it. +var cacheProvisionTimeout = dedicatedProvisionReadyWait + provisionRPCDeadlineMargin // Credentials matches the shape returned by local providers. type Credentials struct { @@ -282,7 +295,9 @@ func (c *Client) ctxWithAuth(ctx context.Context) context.Context { // every-tier change). PVC bind + image pull + postgres init can take 30-90s on // a cold node, so 10s (the old anonymous default) drops the connection while // the pod is still coming up. Anonymous gets a tight 4m budget; pro/team get -// 5m for larger images and bigger PVCs. +// 5m for larger images and bigger PVCs. Both budgets honour the floor of +// dedicatedProvisionReadyWait + provisionRPCDeadlineMargin (3m30s) — see the +// invariant on dedicatedProvisionReadyWait. func provisionTimeout(tier string) time.Duration { if tier == "pro" || tier == "team" || tier == "growth" { return 5 * time.Minute diff --git a/internal/provisioner/client_cov_test.go b/internal/provisioner/client_cov_test.go index 76e64b81..47a76468 100644 --- a/internal/provisioner/client_cov_test.go +++ b/internal/provisioner/client_cov_test.go @@ -425,27 +425,47 @@ func TestProvisionCache_SuccessAndError(t *testing.T) { } } -// TestCacheProvisionTimeout_Value pins the production deadline so a future -// edit that re-grants Redis the multi-minute cold-pod budget (re-introducing -// the /cache/new hang) reds the suite. +// TestCacheProvisionTimeout_Value pins the production deadline derivation: +// the cache RPC deadline must equal the provisioner's dedicated pod-ready +// budget plus margin. A bare literal here (the old 45s "Redis is always a +// shared carve" ceiling) undercut the provisioner's 3m k8s ready-wait and +// killed legitimate cold-pod provisions mid-flight (Wave-2 A1). func TestCacheProvisionTimeout_Value(t *testing.T) { - if cacheProvisionTimeout != 45*time.Second { - t.Errorf("cacheProvisionTimeout = %s; want 45s (generous-but-bounded Redis carve ceiling)", cacheProvisionTimeout) - } - // Must be well under provisionTimeout's pod-backed budget — that gap is - // the whole point: a hung provisioner fails /cache/new fast instead of - // hanging it for minutes. - if cacheProvisionTimeout >= provisionTimeout("pro") { - t.Errorf("cacheProvisionTimeout (%s) must be tighter than provisionTimeout(pro) (%s)", + if want := dedicatedProvisionReadyWait + provisionRPCDeadlineMargin; cacheProvisionTimeout != want { + t.Errorf("cacheProvisionTimeout = %s; want %s (dedicatedProvisionReadyWait + provisionRPCDeadlineMargin)", + cacheProvisionTimeout, want) + } + // Still bounded: must not balloon past provisionTimeout's pod-backed + // budget — a hung provisioner has to fail /cache/new within the same + // envelope as every other provision RPC. + if cacheProvisionTimeout > provisionTimeout("pro") { + t.Errorf("cacheProvisionTimeout (%s) must not exceed provisionTimeout(pro) (%s)", cacheProvisionTimeout, provisionTimeout("pro")) } } +// TestProvisionTimeouts_NeverUndercutDedicatedReadyWait enforces the Wave-2 A1 +// invariant for EVERY provision deadline in this package: no RPC deadline may +// undercut the provisioner's dedicated pod-ready wait (3m) plus margin. The +// tier list spans both provisionTimeout branches plus unknown-tier fallback so +// a future tier addition can't silently reintroduce an undercutting budget. +func TestProvisionTimeouts_NeverUndercutDedicatedReadyWait(t *testing.T) { + floor := dedicatedProvisionReadyWait + provisionRPCDeadlineMargin + for _, tier := range []string{"anonymous", "free", "hobby", "hobby_plus", "pro", "growth", "team", "some-future-tier"} { + if got := provisionTimeout(tier); got < floor { + t.Errorf("provisionTimeout(%q) = %s; must be >= %s (dedicated ready-wait + margin)", tier, got, floor) + } + } + if cacheProvisionTimeout < floor { + t.Errorf("cacheProvisionTimeout = %s; must be >= %s (dedicated ready-wait + margin)", cacheProvisionTimeout, floor) + } +} + // TestProvisionCache_HangBecomesDeadlineError proves the defensive fix: when // the provisioner hangs, ProvisionCache's own client-side deadline fires and // returns a DeadlineExceeded-class error in ~the bounded window — NOT an // indefinite block. This is the error the cache handler converts into a 503 -// provision_failed after soft-deleting the pending resource. +// provision_failed after marking the pending resource 'failed'. // // We shrink cacheProvisionTimeout to a few ms (restored via t.Cleanup) so the // test is fast, and give the *caller* context a generous timeout so the error diff --git a/internal/router/router.go b/internal/router/router.go index 6a82389f..9aa646d1 100644 --- a/internal/router/router.go +++ b/internal/router/router.go @@ -1050,7 +1050,7 @@ func NewWithHooks(cfg *config.Config, db *sql.DB, rdb *redis.Client, geoDbs *mid api.Delete("/resources/:id", middleware.RequireEnvAccess(middleware.EnvPolicyActionDeleteResource, middleware.WithEnvLookup(func(c *fiber.Ctx) (string, error) { - return handlers.ResourceEnvByTokenForMiddleware(c, db) + return handlers.ResourceEnvByTokenOrIDForMiddleware(c, db) }), ), resourceH.Delete, diff --git a/openapi.snapshot.json b/openapi.snapshot.json index 988b50ee..e7866fa1 100644 --- a/openapi.snapshot.json +++ b/openapi.snapshot.json @@ -4581,6 +4581,7 @@ "delete": { "parameters": [ { + "description": "Accepts EITHER the resource's 'id' (as returned by GET /api/v1/resources) OR its provision 'token'. Resolution tries token first, then id; authorization (team ownership) is identical for both forms.", "in": "path", "name": "id", "required": true, @@ -4596,6 +4597,9 @@ }, "403": { "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }." + }, + "404": { + "description": "Not found — no resource with that id or token, or it belongs to another team." } }, "security": [ From 2be5e70bd3d837efede4e6fc4a3fba41cba51f60 Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Thu, 11 Jun 2026 01:19:10 +0530 Subject: [PATCH 2/2] test(router): cover the DELETE /resources/:id env-lookup wiring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Wave-2 A1 rename of the WithEnvLookup closure target (ResourceEnvByTokenOrIDForMiddleware) made router.go's closure line a changed line under diff-cover, and it had no unit coverage (the route was only exercised by build-tagged E2E). New router-package test drives an authenticated DELETE by ROW ID through the real router — covering the closure, the id-fallback resolution, and the idempotent-retry path. Co-Authored-By: Claude Fable 5 --- .../router/resource_delete_env_lookup_test.go | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 internal/router/resource_delete_env_lookup_test.go diff --git a/internal/router/resource_delete_env_lookup_test.go b/internal/router/resource_delete_env_lookup_test.go new file mode 100644 index 00000000..028ffe41 --- /dev/null +++ b/internal/router/resource_delete_env_lookup_test.go @@ -0,0 +1,77 @@ +package router_test + +// resource_delete_env_lookup_test.go — drives the DELETE /api/v1/resources/:id +// route through the REAL router so the WithEnvLookup closure wired in +// router.go (the `return handlers.ResourceEnvByTokenOrIDForMiddleware(c, db)` +// line) executes under unit coverage, not just E2E. Wave-2 A1 renamed that +// helper (token-or-id resolution); this test pins the wiring: +// +// - an authenticated DELETE addressed by the resource's ROW ID resolves +// through the env-lookup (id fallback) and soft-deletes the row, and +// - a second DELETE on the same (now-deleted) row reports idempotent +// success — proving the closure's fail-open path also routes through. + +import ( + "context" + "net/http" + "net/http/httptest" + "testing" + + "github.com/stretchr/testify/require" + + "instant.dev/internal/email" + "instant.dev/internal/plans" + "instant.dev/internal/router" + "instant.dev/internal/testhelpers" +) + +func TestRouter_ResourceDelete_ByRowID_EnvLookupWired(t *testing.T) { + db, dbClean := testhelpers.SetupTestDB(t) + defer dbClean() + rdb, rdbClean := testhelpers.SetupTestRedis(t) + defer rdbClean() + + cfg := newRouterTestConfig() + app := router.New(cfg, db, rdb, nil, email.NewNoop(), plans.Default(), nil, nil) + + // Seed a team + user + an active resource owned by the team. + teamID := testhelpers.MustCreateTeamDB(t, db, "pro") + var userID string + require.NoError(t, db.QueryRowContext(context.Background(), + `INSERT INTO users (team_id, email) VALUES ($1::uuid, $2) RETURNING id::text`, + teamID, testhelpers.UniqueEmail(t)).Scan(&userID)) + jwt := testhelpers.MustSignSessionJWT(t, userID, teamID, "router-del@example.com") + + var rowID string + require.NoError(t, db.QueryRowContext(context.Background(), + `INSERT INTO resources (team_id, resource_type, tier, env, status) + VALUES ($1::uuid, 'webhook', 'pro', 'staging', 'active') + RETURNING id::text`, teamID).Scan(&rowID)) + + doDelete := func() *http.Response { + req := httptest.NewRequest(http.MethodDelete, "/api/v1/resources/"+rowID, nil) + req.Header.Set("Authorization", "Bearer "+jwt) + resp, err := app.Test(req, 15000) + require.NoError(t, err) + return resp + } + + // DELETE by ROW ID through the real router: env-policy middleware runs + // the token-or-id env lookup, then the handler resolves + deletes. + resp := doDelete() + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, + "DELETE by row id through the real router must succeed") + + var status string + require.NoError(t, db.QueryRowContext(context.Background(), + `SELECT status FROM resources WHERE id = $1::uuid`, rowID).Scan(&status)) + require.Equal(t, "deleted", status) + + // Idempotent retry: the row is now 'deleted'; the env lookup still runs + // (resolves the deleted row) and the handler reports already_deleted. + resp2 := doDelete() + defer resp2.Body.Close() + require.Equal(t, http.StatusOK, resp2.StatusCode, + "repeat DELETE must be idempotent success") +}