Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion docs/openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,17 @@ paths:
security:
- BearerAuth: []
parameters:
- $ref: "#/components/parameters/id"
- name: id
in: path
required: true
description: >-
Accepts EITHER the resource's `id` (as returned by
GET /api/v1/resources) OR its provision `token`. Resolution tries
token first, then id; authorization (team ownership) is identical
for both forms.
schema:
type: string
format: uuid
responses:
"200":
description: Resource deleted.
Expand Down
11 changes: 6 additions & 5 deletions internal/db/migrations/024_resources_paused_status.sql
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@ ALTER TABLE resources
ADD CONSTRAINT resources_status_check
-- Forward-consistent full status set (incident 2026-06-10). The migration
-- runner RE-APPLIES every migration on each boot; a NARROW constraint here
-- (missing 'suspended' [added in 049] / 'pending' [added in 057]) crashes
-- the boot the moment a row already holds one of those later-added — but
-- valid — statuses. Re-adding the canonical set makes 024 safe to re-run
-- regardless of data. (024/049/057 now all define the same set.)
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
-- (missing 'suspended' [added in 049] / 'pending' [added in 057] /
-- 'failed' [added in 070]) crashes the boot the moment a row already holds
-- one of those later-added — but valid — statuses. Re-adding the canonical
-- set makes 024 safe to re-run regardless of data. (024/049/057/070 now
-- all define the same set.)
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));

ALTER TABLE resources ADD COLUMN IF NOT EXISTS paused_at TIMESTAMPTZ;

Expand Down
7 changes: 4 additions & 3 deletions internal/db/migrations/049_resources_suspended_status.sql
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,10 @@ ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
ALTER TABLE resources
ADD CONSTRAINT resources_status_check
-- Forward-consistent full status set (incident 2026-06-10): include 'pending'
-- (added in 057) so re-applying 049 on boot can't crash on a valid pending
-- row before 057 runs. 024/049/057 now all define the same canonical set.
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
-- (added in 057) and 'failed' (added in 070) so re-applying 049 on boot
-- can't crash on a valid later-added row before its own migration runs.
-- 024/049/057/070 now all define the same canonical set.
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));

-- Partial index for the auto-unsuspend scan.
-- EnforceStorageQuotaWorker scans WHERE status = 'suspended' on every run to
Expand Down
5 changes: 4 additions & 1 deletion internal/db/migrations/057_resources_pending_status.sql
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@
ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
ALTER TABLE resources
ADD CONSTRAINT resources_status_check
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'expired', 'deleted', 'reaped'));
-- Forward-consistent full status set (incident 2026-06-10): include 'failed'
-- (added in 070) so re-applying 057 on boot can't crash on a valid failed
-- row before 070 runs. 024/049/057/070 now all define the same canonical set.
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));

-- idx_resources_pending_sweep (the partial index the reconciler scans) was
-- already created by migration 030_resource_heartbeat.sql — it indexes
Expand Down
51 changes: 51 additions & 0 deletions internal/db/migrations/070_resources_failed_status.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
-- Migration: 070_resources_failed_status
--
-- Add 'failed' as a permitted value in the resources.status CHECK constraint.
--
-- Background (Wave-2 A1, cross-interface durability sweep 2026-06-11):
-- When the synchronous backend provision RPC fails or times out, every
-- provision handler used to ROLL BACK the just-created 'pending' row by
-- soft-deleting it (status='deleted'). From the caller's point of view the
-- resource simply VANISHED: a timed-out /db/new (or /cache/new, /nosql/new,
-- /queue/new, /vector/new, /storage/new, twin provision) returned 503 and
-- any follow-up GET on the token 404'd because the public read surface
-- hides deleted rows. Agents polling for an in-flight provision had no
-- terminal state to observe.
--
-- The handlers now mark the row status='failed' instead
-- (models.MarkResourceFailed, pending→failed). A failed row is:
-- * visible in GET /api/v1/resources (lists filter only status='deleted')
-- * deletable via DELETE /api/v1/resources/:id (the status!='deleted'
-- guard in SoftDeleteResourceIfActive admits it)
-- * NOT counted toward plan quotas (quota counts filter status='active')
-- * never swept by the provisioner_reconciler (it keys on 'pending') or
-- the TTL reaper (ReapableStatuses() = active/paused/suspended) — the
-- backend object either never existed or was already torn down by the
-- rollback's best-effort cleanup, so there is nothing to deprovision.
--
-- Without this migration every MarkResourceFailed UPDATE would hit
-- constraint-violation 23514 and the rollback path itself would error.
--
-- Status semantics (updated):
-- pending — row inserted, backend provision RPC + URL persistence not yet
-- complete; the transient mid-provision state. NOT usable.
-- active — provisioned, accepting connections (or status-only for queue/storage/webhook)
-- paused — user-initiated pause (Pro+ only); infra revoked; data preserved
-- suspended — system-initiated suspend on storage quota breach; infra revoked
-- failed — terminal: the backend provision RPC failed/timed out and the
-- rollback kept the row as a pollable terminal state. No live
-- backing infra. Visible in lists; deletable; quota-exempt.
-- expired — TTL reached (anonymous resources); soft-deleted equivalent for anon
-- deleted — user-deleted (permanent credentials removed)
-- reaped — legacy: worker-reaped before 'deleted' was the canonical term
--
-- Idempotent: DROP CONSTRAINT IF EXISTS followed by re-ADD of the identical
-- CHECK definition, so re-running on a schema that already applied this
-- migration is harmless. Migrations 024/049/057 were updated in the same
-- change to define this same canonical set (the migration runner RE-APPLIES
-- every migration on each boot — see the forward-consistency note in 024,
-- incident 2026-06-10).

ALTER TABLE resources DROP CONSTRAINT IF EXISTS resources_status_check;
ALTER TABLE resources
ADD CONSTRAINT resources_status_check
CHECK (status IN ('pending', 'active', 'paused', 'suspended', 'failed', 'expired', 'deleted', 'reaped'));
2 changes: 1 addition & 1 deletion internal/db/postgres_migrations_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,7 +511,7 @@ func TestConnectRedis_PanicsOnUnreachable(t *testing.T) {
// re-applying it mid-sequence can never reject a valid row. No DB needed —
// reads the embedded SQL via the same seam the runner uses.
func TestMigrations_ResourcesStatusCheck_ForwardConsistent(t *testing.T) {
canonical := []string{"pending", "active", "paused", "suspended", "expired", "deleted", "reaped"}
canonical := []string{"pending", "active", "paused", "suspended", "failed", "expired", "deleted", "reaped"}
checked := 0
for _, name := range MigrationFiles() {
b, err := readMigrationFile(name)
Expand Down
9 changes: 5 additions & 4 deletions internal/handlers/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,8 +221,9 @@ func (h *CacheHandler) NewCache(c *fiber.Ctx) error {
middleware.RecordProvisionFail("redis", middleware.ProvisionFailBackendUnavailable)
slog.Error("cache.new.provision_failed",
"error", err, "token", tokenStr, "request_id", requestID)
// Soft-delete the resource record so limits aren't falsely consumed.
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
// Mark the pending row 'failed' — a pollable terminal state. Failed
// rows never count against quota (counts filter status='active').
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("cache.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
}
return respondProvisionFailed(c, err, "Failed to provision Redis namespace")
Expand Down Expand Up @@ -387,7 +388,7 @@ func (h *CacheHandler) newCacheAuthenticated(
middleware.RecordProvisionFail("redis", middleware.ProvisionFailBackendUnavailable)
slog.Error("cache.new.provision_failed_auth",
"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("cache.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
}
return respondProvisionFailed(c, err, "Failed to provision Redis namespace")
Expand Down Expand Up @@ -563,7 +564,7 @@ func (h *CacheHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionFor
middleware.RecordProvisionFail(models.ResourceTypeRedis, middleware.ProvisionFailBackendUnavailable)
slog.Error("twin.cache.provision_failed",
"error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID)
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("twin.cache.soft_delete_failed",
"error", delErr, "resource_id", resource.ID, "request_id", in.RequestID)
}
Expand Down
14 changes: 12 additions & 2 deletions internal/handlers/coverage_provisioner_grpc_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ type fakeProvisioner struct {
failProvision bool
deprovisionCalls int

lastReq *provisionerv1.ProvisionRequest
lastReq *provisionerv1.ProvisionRequest
lastDeprovisionReq *provisionerv1.DeprovisionRequest
}

func (f *fakeProvisioner) ProvisionResource(_ context.Context, req *provisionerv1.ProvisionRequest) (*provisionerv1.ProvisionResponse, error) {
Expand Down Expand Up @@ -113,13 +114,21 @@ func (f *fakeProvisioner) ProvisionResource(_ context.Context, req *provisionerv
}
}

func (f *fakeProvisioner) DeprovisionResource(_ context.Context, _ *provisionerv1.DeprovisionRequest) (*provisionerv1.DeprovisionResponse, error) {
func (f *fakeProvisioner) DeprovisionResource(_ context.Context, req *provisionerv1.DeprovisionRequest) (*provisionerv1.DeprovisionResponse, error) {
f.mu.Lock()
f.deprovisionCalls++
f.lastDeprovisionReq = req
f.mu.Unlock()
return &provisionerv1.DeprovisionResponse{}, nil
}

// lastDeprovision reports the DeprovisionRequest most recently recorded by
// DeprovisionResource, or nil when no deprovision call has been recorded.
// The field is read while holding f.mu, the same lock the recorder takes.
func (f *fakeProvisioner) lastDeprovision() *provisionerv1.DeprovisionRequest {
	f.mu.Lock()
	req := f.lastDeprovisionReq
	f.mu.Unlock()
	return req
}

func (f *fakeProvisioner) deprovisionCount() int {
f.mu.Lock()
defer f.mu.Unlock()
Expand Down Expand Up @@ -231,6 +240,7 @@ func setupGRPCProvFixture(t *testing.T, fake *fakeProvisioner, badAESKey bool) g

middleware.SetRoleLookupDB(db)
api := app.Group("/api/v1", middleware.RequireAuth(cfg), middleware.PopulateTeamRole())
api.Get("/resources", resourceH.List)
api.Get("/resources/:id", resourceH.Get)
api.Delete("/resources/:id", resourceH.Delete)
api.Post("/resources/:id/provision-twin", twinH.ProvisionTwin)
Expand Down
6 changes: 3 additions & 3 deletions internal/handlers/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ func (h *DBHandler) NewDB(c *fiber.Ctx) error {
middleware.RecordProvisionFail("postgres", middleware.ProvisionFailBackendUnavailable)
slog.Error("db.new.provision_failed",
"error", err, "token", tokenStr, "request_id", requestID)
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("db.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
}
return respondProvisionFailed(c, err, "Failed to provision Postgres database")
Expand Down Expand Up @@ -438,7 +438,7 @@ func (h *DBHandler) newDBAuthenticated(
middleware.RecordProvisionFail("postgres", middleware.ProvisionFailBackendUnavailable)
slog.Error("db.new.provision_failed_auth",
"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("db.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
}
return respondProvisionFailed(c, err, "Failed to provision Postgres database")
Expand Down Expand Up @@ -647,7 +647,7 @@ func (h *DBHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionForTwi
middleware.RecordProvisionFail(models.ResourceTypePostgres, middleware.ProvisionFailBackendUnavailable)
slog.Error("twin.db.provision_failed",
"error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID)
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("twin.db.soft_delete_failed",
"error", delErr, "resource_id", resource.ID, "request_id", in.RequestID)
}
Expand Down
21 changes: 12 additions & 9 deletions internal/handlers/env_policy_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,27 @@ import (

"github.com/gofiber/fiber/v2"
"github.com/google/uuid"
"instant.dev/internal/models"
)

// ResourceEnvByTokenForMiddleware reads the env stored on a resource row
// addressed by the URL :id param (a public token UUID). Returns the env on
// success or "" on any error — the env-policy middleware fails OPEN on
// lookup error so a malformed/non-existent :id falls through to the
// handler's own 400/404 instead of a confusing 403/env_policy_denied.
// ResourceEnvByTokenOrIDForMiddleware reads the env stored on a resource row
// addressed by the URL :id param — resolved first as a public token UUID,
// then as the row's primary-key id (resolveResourceByTokenOrID), matching
// the DELETE handler's own resolution so the env-policy gate covers BOTH
// address forms (without the id fallback an id-addressed DELETE would skip
// env-policy enforcement entirely). Returns the env on success or "" on any
// error — the env-policy middleware fails OPEN on lookup error so a
// malformed/non-existent :id falls through to the handler's own 400/404
// instead of a confusing 403/env_policy_denied.
//
// Exported with the verbose suffix so its single intended caller (the
// router wiring) is unambiguous; this is not a general-purpose helper.
func ResourceEnvByTokenForMiddleware(c *fiber.Ctx, db *sql.DB) (string, error) {
func ResourceEnvByTokenOrIDForMiddleware(c *fiber.Ctx, db *sql.DB) (string, error) {
tokenStr := c.Params("id")
token, err := uuid.Parse(tokenStr)
pathUUID, err := uuid.Parse(tokenStr)
if err != nil {
return "", nil
}
r, err := models.GetResourceByToken(c.Context(), db, token)
r, err := resolveResourceByTokenOrID(c.Context(), db, pathUUID)
if err != nil {
// Including ErrResourceNotFound — fail open so the handler returns
// its own 404 (which contains a stable, agent-readable shape).
Expand Down
2 changes: 1 addition & 1 deletion internal/handlers/env_policy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ func envPolicyApp(t *testing.T, db *sql.DB) *fiber.App {
api.Delete("/resources/:id",
middleware.RequireEnvAccess(middleware.EnvPolicyActionDeleteResource,
middleware.WithEnvLookup(func(c *fiber.Ctx) (string, error) {
return handlers.ResourceEnvByTokenForMiddleware(c, db)
return handlers.ResourceEnvByTokenOrIDForMiddleware(c, db)
}),
),
func(c *fiber.Ctx) error { return c.JSON(fiber.Map{"ok": true}) },
Expand Down
14 changes: 8 additions & 6 deletions internal/handlers/finalize_provision_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,16 +77,18 @@ func TestFinalizeProvision_PersistenceFailure_ReturnsErrorAndRunsCleanup(t *test
"finalizeProvision must run the cleanup closure on persistence failure to tear down "+
"the just-provisioned backend object; otherwise the platform leaks an orphan")

// 3. Row is soft-deleted (status='deleted'), NOT left at 'pending' or
// 'active'. A pending row would be picked up by the reconciler; an
// active row would falsely advertise itself as usable in dashboard
// listings and quota counts.
// 3. Row is marked failed (status='failed'), NOT left at 'pending' or
// 'active', and NOT soft-deleted. A pending row would be picked up by
// the reconciler; an active row would falsely advertise itself as
// usable in dashboard listings and quota counts; a deleted row would
// VANISH from the caller's read surface (the pre-Wave-2-A1 behaviour)
// leaving no pollable terminal state.
var status string
require.NoError(t, dbConn.QueryRow(
`SELECT status FROM resources WHERE id = $1`, res.ID,
).Scan(&status))
assert.Equal(t, "deleted", status,
"on a persistence failure the row must be soft-deleted so it doesn't leak as an orphan")
assert.Equal(t, models.StatusFailed, status,
"on a persistence failure the row must be marked 'failed' — a pollable terminal state")
}

// TestFinalizeProvision_Success_FlipsToActive is the happy-path guard:
Expand Down
9 changes: 5 additions & 4 deletions internal/handlers/nosql.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,8 +217,9 @@ func (h *NoSQLHandler) NewNoSQL(c *fiber.Ctx) error {
middleware.RecordProvisionFail("mongodb", middleware.ProvisionFailBackendUnavailable)
slog.Error("nosql.new.provision_failed",
"error", err, "token", tokenStr, "request_id", requestID)
// Soft-delete the resource record so limits aren't falsely consumed.
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
// Mark the pending row 'failed' — a pollable terminal state. Failed
// rows never count against quota (counts filter status='active').
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("nosql.new.soft_delete_failed", "error", delErr, "resource_id", resource.ID)
}
return respondProvisionFailed(c, err, "Failed to provision MongoDB database")
Expand Down Expand Up @@ -379,7 +380,7 @@ func (h *NoSQLHandler) newNoSQLAuthenticated(
middleware.RecordProvisionFail("mongodb", middleware.ProvisionFailBackendUnavailable)
slog.Error("nosql.new.provision_failed_auth",
"error", err, "token", tokenStr, "team_id", teamIDStr, "request_id", requestID)
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("nosql.new.soft_delete_failed_auth", "error", delErr, "resource_id", resource.ID)
}
return respondProvisionFailed(c, err, "Failed to provision MongoDB database")
Expand Down Expand Up @@ -567,7 +568,7 @@ func (h *NoSQLHandler) ProvisionForTwinCore(ctx context.Context, in ProvisionFor
middleware.RecordProvisionFail(models.ResourceTypeMongoDB, middleware.ProvisionFailBackendUnavailable)
slog.Error("twin.nosql.provision_failed",
"error", err, "token", tokenStr, "team_id", in.TeamID, "request_id", in.RequestID)
if delErr := models.SoftDeleteResource(ctx, h.db, resource.ID); delErr != nil {
if delErr := models.MarkResourceFailed(ctx, h.db, resource.ID); delErr != nil {
slog.Error("twin.nosql.soft_delete_failed",
"error", delErr, "resource_id", resource.ID, "request_id", in.RequestID)
}
Expand Down
5 changes: 3 additions & 2 deletions internal/handlers/openapi.go
Original file line number Diff line number Diff line change
Expand Up @@ -882,10 +882,11 @@ const openAPISpec = `{
"delete": {
"summary": "Delete a resource",
"security": [{ "bearerAuth": [] }],
"parameters": [{ "name": "id", "in": "path", "required": true, "schema": { "type": "string", "format": "uuid" } }],
"parameters": [{ "name": "id", "in": "path", "required": true, "schema": { "type": "string", "format": "uuid" }, "description": "Accepts EITHER the resource's 'id' (as returned by GET /api/v1/resources) OR its provision 'token'. Resolution tries token first, then id; authorization (team ownership) is identical for both forms." }],
"responses": {
"200": { "description": "Resource deleted" },
"403": { "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }." }
"403": { "description": "Forbidden — not your resource OR blocked by team env_policy. The env_policy variant carries body: { error: 'env_policy_denied', env, action, role, allowed_roles, agent_action }." },
"404": { "description": "Not found — no resource with that id or token, or it belongs to another team." }
}
}
},
Expand Down
Loading
Loading